\documentclass[xcolor = pdftex,dvipsnames,table]{beamer}

\usetheme{Berkeley}

\title{Classification and Regression Trees: Introduction to CART}

\author{Lawrence Hubert}

\begin{document}

\begin{frame}

\titlepage

\end{frame}

\begin{frame}

\frametitle{The Basic Setup}

An $N \times p$ matrix (of predictors), $\mathbf{X}$; $N$ is the number of subjects; $p$ is the number of variables.

\smallskip

An $N \times 1$ vector (to be predicted; the ``predictee''), $\mathbf{Y}$.

\bigskip

If $\mathbf{Y}$ contains nominal (categorical) data (with, say, $K$ categories), we will construct a Classification Tree;

\smallskip

a ``classifier'' uses the data in $\mathbf{X}$ to place a row in $\mathbf{X}$ into a specific category.




\end{frame}






\begin{frame}

\frametitle{}

If $\mathbf{Y}$ contains numerical values (that are not just used for labeling), we will construct a Regression Tree.

\bigskip

As an example. see the Medical Admissions (binary) tree in the SYSTAT manual on CART that you have.

\end{frame}






\begin{frame}

\frametitle{Finding the Binary Splits}

For each numerical (predictor) variable, order its values from smallest to largest and look at the $N - 1$ possible splits between adjacent values.

\bigskip

For each categorical (predictor) variable (with, say, $K$ categories), evaluate all possible $2^{K-1} - 1$ splits of the category codes into two groups.

\bigskip

The best split is chosen to minimize the ``impurity'' of the two resulting subsets that are formed.

\end{frame}






\begin{frame}

\frametitle{Default Measures of Node Impurity}

For numerical $\mathbf{Y}$, use the within sum-of-squares for the two groups formed by the split;

\smallskip

For categorical $\mathbf{Y}$, use the sum of the Gini diversity indices (``gdi'') over the two groups:

\smallskip

for one group with proportions, $p_{1}, \ldots, p_{K}$, over the $K$ groups, gdi $= 1 - \sum_{k=1}^{K} p_{k}^{2}$.

\smallskip

Thus, gdi = 0 when one proportion is 1; gdi is maximal when the proportions are all equal.

\end{frame}






\begin{frame}

\frametitle{A Little History and Terminology}

Morgan and Sonquist were the first (in the 1960s) to suggest this type of ``recursive partitioning'';

\smallskip

they called it AID for ``Automatic Interaction Detection''.

\bigskip

The major R routine for this is in the ``rpart'' set of programs.

\bigskip

We could say it is ``stagewise'' and not ``stepwise'' -- once a split is made, we don't revisit it.

\end{frame}






\begin{frame}

\frametitle{}

Also, the procedure is myopic, in that it only looks one step ahead;

\bigskip

 there is no guarantee of any overall optimality for the trees constructed.

 \bigskip

 We are looking for a good classifier that ``stands up'' to test samples and/or cross-validation.

 \bigskip

 Remember, a ``good fit'' does not necessarily mean a ``good model''.

 \bigskip

 The major reference and reason for current popularity:

 \emph{Classification and Regression Trees} (1984) (Breiman; Friedman; Olshen; Stone)

\end{frame}






\begin{frame}

\frametitle{How to Use the Tree for Prediction}

For a numerical $\mathbf{Y}$, we can use the mean of the values from $\mathbf{Y}$ within the terminal subsets (nodes);

\bigskip

For categorical $\mathbf{Y}$, we can use the category with the greatest proportion (the majority or plurality) of observations in the terminal subsets (nodes).

\bigskip

We could also impose differential costs of misclassification or different prior probabilities of group membership --  this is much like what we can do in using discriminant functions.

\end{frame}






\begin{frame}

\frametitle{Confusion Errors}

Suppose two categories in $\mathbf{Y}$ (``success'' and ``failure''):

\bigskip

\begin{tabular}{ccc}

 & Failure Prediction & Success Predicted \\

 Failure & a & b \\

 Success & c & d \\


\end{tabular}

\bigskip

overall error: $(b + c)/(a + b + c + d)$

model errors:  $b/(a + b)$; $c/(c + d)$

usage errors: $c/(a + c)$; $b/(b + d)$

\end{frame}






\begin{frame}

\frametitle{How to Evaluate Accuracy}

a) $k$-fold cross-validation (the default value of $k$ is usually 10)

\bigskip

the extreme of $k = N$ is the ``leave-one-out'' option

\bigskip

b) bootstrap (about 1/3 of the observations are not resampled and can be used for cross-validation)

these are called the ``out-of-bag'' (OOB) observations

\bigskip

In either case, one ``drops down'' the tree the unsampled cases to see how well one does.

\end{frame}






\begin{frame}

\frametitle{}

All of this is very engineering oriented.  The emphasis is mainly on whether it ``works'' and not on the ``why'' or ``how''.

\bigskip

Thus, clever mechanisms to evaluate accuracy are crucial; cross-validation is central to the CART enterprise.

\bigskip

For the behavioral sciences, we typically are also interested in the predictive structure of the problem, i.e., the ``how'' and ``why''.

\end{frame}






\begin{frame}

\frametitle{Issues}

How ``deep'' should the tree be grown (the ``goldilocks'' problem):

too deep, we ``overfit'' and get unstable prediction;

not deep enough, we get inaccurate prediction.

\bigskip

Strategies:

a) Choose minimal leaf size by a cross-validation (sample reuse) mechanism;

\bigskip

b) Draw a deep tree and ``prune'' back to get to the best cross-validation level.

\bigskip

A tree with one case per terminal node is called ``saturated''.

\end{frame}






\begin{frame}

\frametitle{Older Terminology}

A ``training sample'' (we are training the learner or classifier if we have a categorical $\mathbf{Y}$).

\bigskip

There is a ``test sample'' of new data not used in the training that serves the purposes of cross-validation (and to assess ``shrinkage'').

\bigskip

If we ``drop'' the training sample down the tree, we get the ``resubstitution estimate of error''.

\bigskip

This is an overly optimistic estimate since the same data used to obtain the tree now serves as the means to evaluate it.

\end{frame}






\begin{frame}

\frametitle{}

Going to a saturated tree gives zero resubstitution error but it is terribly overfit and unstable;

\bigskip

that is the reason for pruning back, or evaluating leaf size through cross-validation.

\end{frame}






\begin{frame}

\frametitle{Variable Combinations}

We could include a variety of linear combinations of the original variables in the predictor set.

\bigskip

We get closer to a linear discriminant situation that uses separating hyperplanes.

\bigskip

Otherwise, splits are all perpendicular to the coordinate axes.

\end{frame}






\begin{frame}

\frametitle{Surrogate Splits}

If we have missing data on some predictor variable for an object, and we don't know which class the object should be assigned to when that predictor is used for a particular split, we can use a similar split on another variable that is ``close'' --

we use these (surrogate) splits to assign the object to the class.

\end{frame}






\begin{frame}

\frametitle{Extensions of CART to Tree Ensembles}

Boosting -- this refers to a variety of methods for reweighting hard to classify objects, and redoing the training.

\bigskip

Bagging -- this stands for bootstrap aggregation; multiple trees are produced and averages are taken over the trees for prediction purposes; out-of-bag observations are used for evaluation.

Random Forests --  use random subsets of the predictors to grow the trees.

\end{frame}






\begin{frame}

\frametitle{Berk's Summary Comment}

Random forests and boosting are probably state-of-the-art forecasting tools.

\end{frame}










\end{document}