Made data cleaning easier. Added a VAR

This commit is contained in:
Alex Gebben Work 2025-09-09 17:12:57 -06:00
parent 60127fb003
commit 6162487b95
3 changed files with 123 additions and 65 deletions

View File

@ -1,82 +1,86 @@
library(rvest)
library(tidyverse) library(tidyverse)
library(fixest) library(fixest)
#Data found on the page http://eadiv.state.wy.us/pop/ source("Scripts/Functions.r")
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm") #source("Scripts/Load_Wyoming_Web_Data.r")
NODE <- html_element(PAGE ,"table") DATA_TO_GATHER <- list()
TBL <- html_table(NODE) DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c("WYPOP","WY_POP",TRUE)
ST <- which(toupper(TBL$X1)=="ALBANY") DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c("WYNQGSP","WY_GDP",TRUE)
END <- which(toupper(TBL$X1)=="TOTAL") DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c("MEHOINUSWYA646N","WY_MED_INCOME",TRUE)
TYPES <- TBL[ST-2,1] DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c("BUSAPPWNSAWY","WY_BUISNESS_APPLICATIONS",FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('ACTLISCOUWY','WY_HOUSES_FOR_SALE',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYRVAC','WY_RENTAL_VACANCY_RATE',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYBPPRIVSA','WY_PRIVATE_HOUSING',FALSE)
#New Private Housing Units Authorized by Building Permits for Wyoming
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('B03002006E056023','LN_FIVE_YEAR_POP',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('GDPALL56023','LN_GDP',TRUE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYLINC3POP','LN_POP',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('LAUCN560230000000005','LN_EMPLOYMENT',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('BPPRIV056023','LN_PRIVE_HOUSING',FALSE)
ST_YEAR <- 1971 DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('ENU5602320510','LN_NUM_ESTABLISHMENTS',FALSE)
ALL_DATA <- list()
TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))] DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('GDPALL56041','UINTA_GDP',TRUE)
TBL <- TBL[,-ncol(TBL)] DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYUINT1POP','UINTA_POP',FALSE)
colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1))) DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYSUBL5POP','SUBLETTE_POP',FALSE)
TBL$Type <- NA DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYSWEE7POP','SWEETWATER_POP',FALSE)
for(i in 1:length(ST)){ DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('WYTETO9POP','TETON_POP',FALSE)
TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1]) ##Idaho Counties
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('IDBEAR7POP','BEAR_LAKE_POP',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('IDCARI9POP','CARIBOU_POP',FALSE)
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('IDBONN0POP','BONNEVILLE_POP',FALSE)
###US Population
DATA_TO_GATHER[[length(DATA_TO_GATHER)+1]] <- c('POPTOTUSA647NWDB','US_POP',FALSE)
for(x in 1:length(DATA_TO_GATHER)){
CURRENT <- DATA_TO_GATHER[[x]]
if(CURRENT[3]){C_DATA <- CPI_ADJUST(FRED_GET(CURRENT[1],CURRENT[2]))}else{C_DATA <- FRED_GET(CURRENT[1],CURRENT[2])}
if(x==1){RES <- C_DATA}else{RES <- RES %>% full_join(C_DATA)}
rm(CURRENT,C_DATA)
} }
TBL[ST[2]:END[2],"Type"] <- as.character(TYPES[2,1]) DATA <- RES %>% mutate(US_POP=US_POP-WY_POP,WY_POP=WY_POP-LN_POP-UINTA_POP-SUBLETTE_POP-SWEETWATER_POP-TETON_POP)
TBL$Type colnames(DATA)
TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything()) feols(log(LN_POP) ~log(US_POP)+log(UINTA_POP)+log(WY_POP)+log(SUBLETTE_POP)+log(SWEETWATER_POP)+log(TETON_POP)+log(BEAR_LAKE_POP)+log(CARIBOU_POP)+log(BONNEVILLE_POP)+YEAR,data=DATA)
GROUP <- colnames(TBL)[-1:-2] TS_DATA_ORIG <- DATA %>% select(YEAR,LN_POP,US_POP,WY_POP,UINTA_POP,SUBLETTE_POP,SWEETWATER_POP,TETON_POP,BEAR_LAKE_POP,CARIBOU_POP,BONNEVILLE_POP) %>% filter(!is.na(LN_POP)) %>% arrange(YEAR) %>% select(-YEAR)
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
Data <- pivot_wider(Data,names_from=Type,values_from=Pop_Change)
colnames(Data)[5] <-"Migration"
HOUSE_INCOME <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=MEHOINUSWYA672N&scale=left&cosd=1984-01-01&coed=2023-01-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1984-01-01")
HOUSE_INCOME <-
colnames(HOUSE_INCOME) <- c("Year","WY_INCOME")
HOUSE_INCOME <- HOUSE_INCOME %>% mutate(Year=year(Year))
#Employment in construction of Wyoming TS_DATA <- diff(log(ts(TS_DATA_ORIG,start=c(1970),end=c(2024),frequency=1)))
CON_EMP <- read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WYCONS&scale=left&cosd=1990-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1990-01-01') library("forecast")
colnames(CON_EMP) <- c("Year","WY_CON_EMP") library("vars")
CON_EMP <- CON_EMP%>% mutate(Year=year(Year)) VARselect(TS_DATA,lag.max=4,type="const")
LINC_UNEMP <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WYLINC3URN&scale=left&cosd=1990-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1990-01-01' VAR1 <- VAR(TS_DATA,p=3,type="const",season=NULL,exog=NULL)
LINC_GDP <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=REALGDPALL56023&scale=left&cosd=2001-01-01&coed=2023-01-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=2001-01-01' plot(irf(VAR1,response="LN_POP"))
LINC_GDP<- read_csv(LINC_GDP) plot(forecast(VAR1,))
colnames(LINC_GDP) <- c("Year","LINC_GDP") RES <- (predict(VAR1, n.ahead = 20, ci = 0.95))
LINC_GDP <- LINC_GDP %>% mutate(Year=year(Year)) names(RES )
names(RES$fcst)
RES$fcst$LN_POP %>% as_tibble
LINC_PRIV_IND <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=ENU5602320510&scale=left&cosd=1990-01-01&coed=2024-10-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1990-01-01' CURRENT_POP <- max(DATA$LN_POP,na.rm=TRUE)
LINC_PRIV_IND <- read_csv(LINC_PRIV_IND) 0.0157*CURRENT_POP*1000
LINC_PRIV_IND 0.083*CURRENT_POP*1000
colnames(LINC_PRIV_IND) <- c("Year","LINC_PRIV_IND") -0.0489*CURRENT_POP*1000
LINC_PRIV_IND <- LINC_PRIV_IND %>% mutate(Year=year(Year))
LINC_LABOR_FORCE <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WYLINC3LFN&scale=left&cosd=1990-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1990-01-01'
LINC_LABOR_FORCE<- read_csv(LINC_LABOR_FORCE)
colnames(LINC_LABOR_FORCE) <- c("Year","LINC_LABOR_FORCE")
LINC_LABOR_FORCE<- LINC_LABOR_FORCE%>% mutate(Year=year(Year))
LINC_GOV_GDP <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=GDPGOVT56023&scale=left&cosd=2001-01-01&coed=2023-01-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=2001-01-01'
LINC_GOV_GDP <- read_csv(LINC_GOV_GDP) # View the forecasted values and confidence intervals
colnames(LINC_GOV_GDP) <- c("Year","LINC_GOV_GDP") print(forecast_results)
LINC_GOV_GDP<-LINC_GOV_GDP%>% mutate(Year=year(Year))
# You can also plot the forecasts
plot(forecast_results)
LINC_POP <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WYLINC3POP&scale=left&cosd=1970-01-01&coed=2024-01-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1970-01-01' ?forecast
LINC_POP <- read_csv(LINC_POP) ?forecast
colnames(LINC_POP) <- c("Year","LINC_POP") install.packages("sparsegl")
LINC_POP <- LINC_POP %>% mutate(Year=year(Year)) plot(VAR1)
WY_POP <- 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WYPOP&scale=left&cosd=1900-01-01&coed=2024-01-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-08-29&revision_date=2025-08-29&nd=1900-01-01'
WY_POP <- read_csv(WY_POP)
colnames(WY_POP) <- c("Year","WY_POP")
WY_POP <- WY_POP %>% mutate(Year=year(Year))
LINC_PRIV_IND
DF <- LINC_POP %>% left_join(WY_POP) %>% left_join(LINC_PRIV_IND) %>% left_join(LINC_LABOR_FORCE) %>% left_join(LINC_GOV_GDP) %>% left_join(LINC_GDP) %>% left_join(WY_POP)
DF %>% select(LINC_POP,WY_POP)
DF <- DF %>% mutate(WY_POP=WY_POP-LINC_POP)
#Check a VAR it looks like lags on changes to Private industry could affect other variables #Check a VAR it looks like lags on changes to Private industry could affect other variables
#Idea check a SVAR placing limits on which shocks are first #Idea check a SVAR placing limits on which shocks are first
feols((LINC_POP)~(WY_POP)+log(LINC_PRIV_IND)+Year,data=DF) feols((LINC_POP)~(WY_POP)+log(LINC_PRIV_IND)+Year,data=DF)

26
Scripts/Functions.r Normal file
View File

@ -0,0 +1,26 @@
#A function to extract date from FRED. Assumes the date is either annual or monthly.
FRED_GET <- function(FRED_SERIES_ID,NAME=NA,ST_DATE='1890-01-91',END_DATE=Sys.Date(),ANNUAL_DATA=TRUE){
NAME <- ifelse(is.na(NAME),FRED_SERIES_ID,NAME)
DATA_TIME_FRAME <- ifelse(ANNUAL_DATA,"Annual","Monthly")
URL <- paste0('https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=827&nt=0&thu=0&trc=0&show_legend=no&show_axis_titles=no&show_tooltip=no&id=',FRED_SERIES_ID,'&scale=left&cosd=',ST_DATE,'&coed=',END_DATE,'&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=',DATA_TIME_FRAME,'&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=',END_DATE,'&revision_date=',END_DATE,'&nd=',END_DATE)
DATA <- read_csv(URL)
DATA <- DATA[which(!is.na(DATA[,2] )),]
colnames(DATA)[2] <- NAME
if(ANNUAL_DATA){
colnames(DATA)[1] <- "YEAR"
DATA$YEAR <- year(DATA$YEAR)
}else{
colnames(DATA)[1] <- "DATE"
}
return(DATA)
}
CPI_ADJUST <- function(DATA){
CPI <- FRED_GET("CPIAUCSL") %>% filter(!is.na(CPIAUCSL))
CPI$CPI_ADJ <- as.numeric(CPI[which.max(CPI$YEAR),2])/CPI$CPIAUCSL
CPI <- CPI %>% select(-"CPIAUCSL")
DATA <- DATA %>% left_join(CPI)
DATA[,2] <- DATA[,2]*DATA[,"CPI_ADJ"]
DATA <- DATA%>% select(-"CPI_ADJ")
return(DATA)
}

View File

@ -0,0 +1,28 @@
library(rvest)
#Data found on the page http://eadiv.state.wy.us/pop/
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm")
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)=="ALBANY")
END <- which(toupper(TBL$X1)=="TOTAL")
TYPES <- TBL[ST-2,1]
ST_YEAR <- 1971
ALL_DATA <- list()
TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))]
TBL <- TBL[,-ncol(TBL)]
colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1)))
TBL$Type <- NA
for(i in 1:length(ST)){
TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1])
}
TBL[ST[2]:END[2],"Type"] <- as.character(TYPES[2,1])
TBL$Type
TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything())
GROUP <- colnames(TBL)[-1:-2]
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change)
colnames(Data)[5] <-"Migration"