% 2025-09-12 13:58:32 -06:00

% 172 lines
% 4.1 KiB
% TeX

% Repository viewer notice: this file contains ambiguous Unicode characters.

% This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning.

\documentclass{beamer}
\usepackage{graphicx}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{verbatim}
\usepackage{lipsum} % for placeholder text
\graphicspath{{pdf_images/}}
\title{ECON 4530/5530 \\ Computational Economics}
\subtitle{Joining Data and Data Transformation}
\author{Alex Gebben}
\begin{document}
% Title Slide
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%
\begin{frame}{Joining Data}
\begin{itemize}
\item Real-world data often comes in multiple tables.
\item Joins combine related data based on common keys.
\item \texttt{dplyr} provides intuitive functions for joining.
\end{itemize}
\end{frame}
% Slide 2
\begin{frame}{Types of Joins}
\begin{itemize}
\item \texttt{left\_join()}: keeps all rows from the left table
\item \texttt{right\_join()}: keeps all rows from the right table
\item \texttt{inner\_join()}: keeps only matching rows
\item \texttt{full\_join()}: keeps all rows from both tables
\end{itemize}
\end{frame}
% Slide 3
\begin{frame}{left\_join()}
\texttt{left\_join(df1, df2, by = "id")}
\newline
OR
\newline
\onslide<2->{\texttt{df1 \%>\% left\_join(df2)}}
\begin{itemize}
\onslide<3->{ \item Keeps all rows from the left table.}
\onslide<4->{ \item Adds matching rows from the right table.}
\onslide<5->{ \item Missing matches are filled with \texttt{NA}.}
\end{itemize}
\end{frame}
% Slide 4
\begin{frame}{right\_join()}
\texttt{right\_join(df1, df2, by = "id")}
\newline
OR
\newline
\onslide<2->{\texttt{df2 \%>\% left\_join(df1)}}
\onslide<3->{
\begin{itemize}
\item Keeps all rows from the right table.
\item Adds matching rows from the left table.
\end{itemize}
}
\end{frame}
% Slide 5
\begin{frame}{inner\_join()}
\texttt{inner\_join(df1, df2, by = "id")}
\newline
OR
\newline
\texttt{df1 \%>\% inner\_join(df2, by = "id")}
\begin{itemize}
\item Keeps only rows with matching keys in both tables.
\item Most commonly used for filtering to shared data.
\end{itemize}
\end{frame}
% Slide 6
\begin{frame}{full\_join()}
\texttt{full\_join(df1, df2, by = "id")}
\newline
OR
\newline
\texttt{df1 \%>\% full\_join(df2, by = "id")}
\begin{itemize}
\item Keeps all rows from both tables.
\item Missing matches are filled with \texttt{NA}.
\end{itemize}
\end{frame}
\begin{frame}{Common Issues}
\begin{itemize}
\item Mismatched column names
\item Duplicate keys can lead to unexpected row duplication
\item Data types must match: both keys should be character, or both numeric
\item Missing value joins will introduce \texttt{NA}s
\end{itemize}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{venn.png}
\end{frame}
\begin{frame}[plain]
\begin{columns}
\begin{column}{0.5\textwidth}
\includegraphics[width=\textwidth]{1_join}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[width=\textwidth]{2_join}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{3_join}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{full.png}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{left.png}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{right.png}
\end{frame}
% Slide 7
\begin{frame}[plain]
\includegraphics[width=\textwidth]{match-types.png}
\end{frame}
\begin{frame}{Checking with Anti-Joins}
\includegraphics[width=\textwidth]{anti.png}
\end{frame}
\begin{frame}{Checking with Anti-Joins}
\includegraphics[width=\textwidth]{anti.png}
\end{frame}
\begin{frame}{Filtering with Semi-Joins}
\includegraphics[width=\textwidth]{semi.png}
\end{frame}
\begin{frame}{Other non-equi joins}
\includegraphics[width=\textwidth]{gte}
\end{frame}
% Slide 8
\begin{frame}{Best Practices}
\begin{itemize}
\item Inspect keys before joining: \texttt{unique()}
\item Use \texttt{anti\_join()} to find unmatched rows
\item Validate results with \texttt{summary()} and \texttt{count()}
\end{itemize}
\end{frame}
% Slide 9
\begin{frame}{Class Exercise}
\begin{itemize}
\item Joins are essential for combining data.
\item Choose the right join based on your goal.
\item Always check for common issues before and after joining.
\end{itemize}
\end{frame}
\end{document}