% ECON 4530/5530 Computational Economics: beamer slides on joining data and data transformation.
\documentclass{beamer}
\usepackage{graphicx}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{verbatim}
\usepackage{lipsum} % for placeholder text
\graphicspath{{pdf_images/}}


\title{ECON 4530/5530 \\ Computational Economics}
\subtitle{Joining Data and Data Transformation}
\author{Alex Gebben}

\begin{document}

% Title Slide
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%
\begin{frame}{Joining Data}
\begin{itemize}
\item Real-world data often comes in multiple tables.
\item Joins combine related data based on common keys.
\item \texttt{dplyr} provides intuitive functions for joining.
\end{itemize}
\end{frame}

% Slide 2
\begin{frame}{Types of Joins}
\begin{itemize}
\item \texttt{left\_join()} -- keep all rows from left table
\item \texttt{right\_join()} -- keep all rows from right table
\item \texttt{inner\_join()} -- keep only matching rows
\item \texttt{full\_join()} -- keep all rows from both tables
\end{itemize}
\end{frame}

% Slide 3
\begin{frame}{left\_join()}
\texttt{left\_join(df1, df2, by = "id")}
\newline
OR
\newline
\onslide<2->{\texttt{df1 \%>\% left\_join(df2)}}
\begin{itemize}
\onslide<3->{ \item Keeps all rows from the left table.}
\onslide<4->{ \item Adds matching rows from the right table.}
\onslide<5->{ \item Missing matches are filled with \texttt{NA}.}
\end{itemize}
\end{frame}

% Slide 4
\begin{frame}{right\_join()}

\texttt{right\_join(df1, df2, by = "id")}
\newline
OR
\newline
% A left join with the tables swapped keeps the same rows as right_join(df1, df2); column order differs.
\onslide<2->{\texttt{df2 \%>\% left\_join(df1)}}
\onslide<3->{
\begin{itemize}
\item Keeps all rows from the right table.
\item Adds matching rows from the left table.
\end{itemize}
}
\end{frame}

% Slide 5
\begin{frame}{inner\_join()}
\texttt{inner\_join(df1, df2, by = "id")}
\newline
OR
\newline
\texttt{df1 \%>\% inner\_join(df2, by = "id")}
\begin{itemize}
\item Keeps only rows with matching keys in both tables.
\item Most commonly used for filtering to shared data.
\end{itemize}
\end{frame}

% Slide 6
\begin{frame}{full\_join()}
\texttt{full\_join(df1, df2, by = "id")}
\newline
OR
\newline
\texttt{df1 \%>\% full\_join(df2, by = "id")}

\begin{itemize}
\item Keeps all rows from both tables.
\item Missing matches are filled with \texttt{NA}.
\end{itemize}
\end{frame}
\begin{frame}{Common Issues}
\begin{itemize}
\item Mismatched column names
\item Duplicate keys -- can lead to unexpected row duplication
\item Data types must match: both keys should be character, or both numeric
\item Missing value joins will introduce \texttt{NA}s
\end{itemize}

\end{frame}

\begin{frame}[plain]
\includegraphics[width=\textwidth]{venn.png}
\end{frame}

\begin{frame}[plain]
\begin{columns}
\begin{column}{0.5\textwidth}
\includegraphics[width=\textwidth]{1_join}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[width=\textwidth]{2_join}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[plain]
\includegraphics[width=\textwidth]{3_join}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{full.png}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{left.png}
\end{frame}
\begin{frame}[plain]
\includegraphics[width=\textwidth]{right.png}
\end{frame}

% Slide 7
\begin{frame}[plain]
\includegraphics[width=\textwidth]{match-types.png}
\end{frame}
\begin{frame}{Checking with Anti-Joins}
\includegraphics[width=\textwidth]{anti.png}
\end{frame}
\begin{frame}{Checking with Anti-Joins}
\includegraphics[width=\textwidth]{anti.png}
\end{frame}
\begin{frame}{Filtering with semi-joins}
\includegraphics[width=\textwidth]{semi.png}
\end{frame}
\begin{frame}{Other non-equi joins}
\includegraphics[width=\textwidth]{gte}
\end{frame}

% Slide 8
\begin{frame}{Best Practices}
\begin{itemize}
\item Inspect keys before joining: \texttt{unique()}
\item Use \texttt{anti\_join()} to find unmatched rows
\item Validate results with \texttt{summary()} and \texttt{count()}
\end{itemize}
\end{frame}

% Slide 9
\begin{frame}{Class Exercise}
\begin{itemize}
\item Joins are essential for combining data.
\item Choose the right join based on your goal.
\item Always check for common issues before and after joining.
\end{itemize}
\end{frame}

\begin{frame}{Pivot}
\centering
\includegraphics[width=0.6\textwidth]{Pivot_Data.png}
\end{frame}
\begin{frame}{Pivot longer}
\centering
\includegraphics[width=\textwidth]{pivot_longer.png}
\end{frame}
\begin{frame}{Pivot longer}
\centering
\includegraphics[width=\textwidth]{column-names.png}
\end{frame}
\begin{frame}{Pivot longer}
\centering
\includegraphics[width=\textwidth]{cell-values.png}
\end{frame}

\end{document}