\documentclass[12pt]{article}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsbsy}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage[english]{babel}
\usepackage{commath}
\usepackage{dsfont}
\usepackage[T1]{fontenc}
\usepackage[margin=1.25in]{geometry}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage[utf8]{inputenc}
\usepackage{microtype}
\usepackage{natbib}
\linespread{1.1} % This improves readability.
% Allow use of "blah" instead of ``blah''
\usepackage{csquotes}
\MakeOuterQuote{"}
% Macros for adding comments in the margins
\setlength{\marginparwidth}{2cm}
\usepackage[textsize=tiny]{todonotes}
\newcommand\comment[1]{\todo[color=blue!20!white]{#1}}
% Define header variables here.
\newcommand\coursename{COMS 4774 Spring 2021}
\newcommand\scribe{Daniel Hsu (djh2164)}
\newcommand\editor{Chris Alberti (ca2790)}
\newcommand\lecturedate{September 1, 2017}
\newcommand\lecturetitle{Batch learning via online prediction}
% Define theorem environments here.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
% Put your custom macros here.
\newcommand\Alg{\mathcal{A}}
\newcommand\Batch{\mathcal{B}}
\newcommand\E{\mathbb{E}}
\newcommand\X{\mathcal{X}}
\newcommand\Prob{\mathbb{P}}
\newcommand\err{\operatorname{err}}
\newcommand\testerr{\widehat{\operatorname{err}}_{\operatorname{test}}}
\newcommand\trainerr{\widehat{\operatorname{err}}_{\operatorname{train}}}
\newcommand\pverr{\widehat{\operatorname{err}}_{\operatorname{pv}}}
\newcommand\OTB[1]{\operatorname{OTB}_{#1}}
\newcommand\ind[1]{\mathds{1}_{\{#1\}}}
\begin{document}
% This creates the header for scribe notes.
\noindent \coursename \hfill Scribe: \scribe \\
\lecturedate \hfill Editor: \editor
\begin{center}
{\Large\lecturetitle}
\end{center}
\section{Introduction}
\label{sec:otb}
One of the major applications of online learning is batch learning
(i.e., offline learning).
In batch learning for binary classification, there is a sequence of
labeled examples
\[
\del{ (x_1,y_1), (x_2,y_2), \dotsc, (x_n,y_n) }
\in (\X \times \{\pm1\})^n
\]
called the ``training data'', and the goal is to find a classifier
with small prediction error.
The training data are assumed to be drawn i.i.d.~from a fixed
probability distribution $\Prob$.
The prediction error of a classifier $f \colon \X \to \{\pm1\}$ is
the probability that the classifier mis-classifies a random example
$(x,y)$ drawn from $\Prob$:
\[
\err(f) := \Prob(f(x) \neq y)
.
\]
A batch learning algorithm takes as input the training data and
returns a classifier.
We'll also talk about randomized classifiers $f$; the
probability defining $\err(f)$ is then taken with respect to both the
internal randomization in $f$ as well as the random draw $(x,y)$ from
$\Prob$.
We abstractly think of an online prediction algorithm as taking as
input a sequence of $n$ labeled examples, and producing a sequence of
classifiers $f_1, f_2, \dotsc, f_n, f_{n+1} \colon \X \to \{\pm1\}$.
The $t$-th classifier $f_t$ can only depend on the first $t-1$ labeled
examples.
Although not all online prediction algorithms fit this template, it is
sufficient for most cases.
\comment{What algorithms don't fit this template?}
\section{Expected prediction error vs mistake bounds}
In expectation, the prediction error of the classifier returned by the
batch learning algorithm $\Batch$ can be written as
\[
\E\del{
\ind{\Batch(S_{1:n})(x_{n+1}) \neq y_{n+1}}
}
\]
where
\[
S_{1:n+1}
:= \del{ (x_1,y_1), (x_2,y_2), \dotsc, (x_{n+1},y_{n+1}) }
\]
is a sequence of $n+1$ labeled examples drawn i.i.d.~from $\Prob$, and
$S_{1:n}$ denotes the sequence of the first $n$ such examples.
If $\Batch$ is a randomized algorithm, then the expectation above is also
taken with respect to the random bits used by $\Batch$.
The following is a simple way to convert an online prediction
algorithm $\Alg$ such as Perceptron and Winnow into a (randomized)
batch learning algorithm $\OTB{\Alg}$ in a way that bounds the expected
prediction error of $\OTB{\Alg}$ in terms of a mistake bound for $\Alg$.
\begin{algorithm}[H]
\caption{Online-to-batch learning algorithm $\OTB{\Alg}$}
\label{alg:otb}
\begin{algorithmic}[1]
\renewcommand\algorithmicrequire{\textbf{input}}
\REQUIRE Training data $S_{1:n} :=
((x_1,y_1),(x_2,y_2),\dotsc,(x_n,y_n))$.
\STATE Run $\Alg$ with the sequence $S_{1:n}$ to produce
classifiers
$f_1, f_2, \dotsc, f_n, f_{n+1}$.
\STATE Pick $T \in \{1,2,\dotsc,n+1\}$ uniformly at random.
\RETURN $f_T$.
\end{algorithmic}
\end{algorithm}
Another way to interpret \Cref{alg:otb} is as follows.
\begin{enumerate}
\item Run $\Alg$ on the training data sequence, and keep all $n+1$
classifiers $f_1, f_2, \dotsc, f_n, f_{n+1}$ generated along the
way.
\item The final classifier used for prediction is a randomized
classifier: to predict on a new instance $x$, first randomly
choose $f_T \in \{f_1,f_2,\dotsc,f_{n+1}\}$, and then predict
$f_T(x)$.
\end{enumerate}
\begin{theorem}
\label{thm:otb}
Let $M_{\Alg,n+1}$ be the number of mistakes made by online
prediction algorithm $\Alg$ on a sequence of $n+1$ i.i.d.~examples
$S_{1:n+1}$ drawn from $\Prob$.
The expected prediction error of $\OTB{\Alg}(S_{1:n})$ is
$\E(M_{\Alg,n+1}) / (n+1)$.
\end{theorem}
\begin{proof}
By direct computation:
\begin{align*}
\E\del{
\ind{\OTB{\Alg}(S_{1:n})(x_{n+1}) \neq y_{n+1}}
}
& = \sum_{t=1}^{n+1} \Pr(T=t) \cdot
\E\del{
\ind{\OTB{\Alg}(S_{1:n})(x_{n+1}) \neq y_{n+1}}
\,|\, T=t
}
\\
& = \sum_{t=1}^{n+1} \frac{1}{n+1}
\E\del{
\ind{f_t(x_{n+1}) \neq y_{n+1}}
}
\\
& = \sum_{t=1}^{n+1} \frac{1}{n+1}
\E\del{
\ind{f_t(x_t) \neq y_t}
}
\\
& = \frac{1}{n+1}
\E\del{
\sum_{t=1}^{n+1}
\ind{f_t(x_t) \neq y_t}
}
.
\end{align*}
The penultimate step is true because $f_t$ is only a function of
$S_{1:t-1}$, and because the distribution of $(S_{1:t-1},(x_{n+1},y_{n+1}))$ is the same as that of
$(S_{1:t-1},(x_t,y_t))$.
The sum in the final expectation is $M_{\Alg,n+1}$.
\end{proof}
\Cref{thm:otb} shows that an online prediction algorithm with a
small mistake bound can be effectively used for batch learning.
If the mistake bound of $\Alg$ for a sequence of $n+1$ examples is
sublinear in $n$, then the expected prediction error of $\OTB{\Alg}$ is
$o(1)$.
Indeed, the mistake bounds for Perceptron and Winnow are constant in
the ``realizable'' setting (when there is a half-space function that
is always correct with a margin).
In the ``agnostic'' setting (i.e., when no such perfect half-space
function is assumed to exist), it is possible to show that the expected
error of $\OTB{\Alg}$ is at most the optimal hinge-loss achieved by a
half-space function specified by norm-bounded weight vectors, plus a
term that is roughly $O(1/\sqrt{n})$.
Computationally, \Cref{alg:otb} can be quite attractive if
the online prediction algorithm has constant-time updates and
constant memory requirements (with respect to $n$).
Perceptron and Winnow certainly fit this bill.
In these cases, the running time scales linearly with $n$.
In fact, the algorithm only needs streaming access to the data.
One drawback of \Cref{alg:otb} is that its final classifier
is a randomized classifier: it randomly picks among the $n+1$
classifiers generated by the online prediction algorithm, and uses the
chosen classifier to predict.
However, the randomized classifier can be \emph{derandomized} by using
a majority vote over the $n+1$ classifiers.
The prediction error of the majority-vote classifier is at most twice
the prediction error of the randomized classifier.
Some online prediction algorithms such as Perceptron and Winnow don't
update their weight vectors unless a mistake is made, and thus there
may actually be fewer than $n+1$ different classifiers.
In such cases, it can be more efficient to represent the final
randomized (or majority-vote) classifier as a weighted combination of
these different classifiers.
The weight of each classifier is proportional to the number of rounds
in which it is used by the online prediction algorithm---specifically,
one plus the number of examples on which it predicts correctly.
In the case of half-space functions, it is reasonable to use a
(weighted) average of the weight vectors corresponding to the
different classifiers.
This can be thought of as an approximation to the majority-vote
classifier.
In some cases, averaging can be rigorously justified and in fact shown
to be at least as good as the randomized classifier.
\section{Progressive validation}
Another significant advantage of using online prediction algorithms
for batch learning is that one can estimate the prediction error of
the learned classifier very accurately, without the need for a
separate held-out test set.
This technique is called \emph{progressive validation}.
To motivate progressive validation, we first review how validation is
often performed in batch learning.
Suppose $n$ labeled examples
$S_{1:n} := ((x_1,y_1),(x_2,y_2),\dotsc,(x_n,y_n))$ are available.
These examples are split into two groups: the first $n-m$ examples
$S_{1:n-m}$ and the final $m$ examples $S_{n-m+1:n}$.
The first $n-m$ examples $S_{1:n-m}$ are given to a batch learning
algorithm $\Batch$ to learn a classifier $\hat{f} := \Batch(S_{1:n-m})$.
The last $m$ examples $S_{n-m+1:n}$ are used to compute the \emph{test
error} of $\hat{f}$:
\[
\testerr(\hat{f}) := \frac{1}{m} \sum_{t=n-m+1}^{n}
\ind{ \hat{f}(x_t) \neq y_t }
.
\]
Under the assumption that the original $n$ examples are drawn
i.i.d.~from some fixed distribution $\Prob$, the test error
$\testerr(\hat{f})$ is an unbiased estimator of the prediction error of
$\hat{f}$:
\[
\E\del{ \testerr(\hat{f}) } = \err(\hat{f}) .
\]
Note that this is not true of the \emph{training error} of $\hat{f}$:
\[
\trainerr(\hat{f}) := \frac{1}{n-m} \sum_{t=1}^{n-m}
\ind{ \hat{f}(x_t) \neq y_t }
.
\]
Indeed, $\hat{f}$ is generally not independent of $S_{1:n-m}$ because
those examples are used by the batch learning algorithm to produce
$\hat{f}$.
However, the last $m$ examples $S_{n-m+1:n}$ are independent of
$\hat{f}$, so
\[
\E\del{ \ind{\hat{f}(x_t) \neq y_t} } = \err(\hat{f})
\]
for $n-m+1 \leq t \leq n$, and hence $\E( \testerr(\hat{f}) ) =
\err(\hat{f})$.
The following proposition is a standard application of
binomial tail bounds (e.g., Hoeffding's inequality).
It can be improved considerably, although we shall not discuss it here
because the same improvements will also apply to the approach we
discuss next.
\begin{proposition}
\label{prop:test-bound}
Let $\hat{f} := \Batch(S_{1:n-m})$ for any batch learning algorithm
$\Batch$.
For any $\delta \in (0,1)$,
\[
\Pr\del{
\err(\hat{f}) \leq \testerr(\hat{f})
+ \sqrt{\frac{\ln(1/\delta)}{2m}}
}
\geq 1-\delta
\]
where the probability is taken with respect to $S_{n-m+1:n}$.
\end{proposition}
Progressive validation provides a way to (essentially) use \emph{all
$n$ examples} for both training and testing, as long as the training is
done using the online-to-batch procedure $\OTB{\Alg}$ with some online
prediction algorithm $\Alg$.
Let $\hat{f} := \OTB{\Alg}(S_{1:n-1})$ be the final randomized
classifier produced by $\OTB{\Alg}$ using the first $n-1$
examples.
That is, $\hat{f}$ is the randomized classifier that picks one of
$f_1, f_2, \dotsc, f_n$ uniformly at random and uses the chosen
classifier to predict.
The estimator for the prediction error of $\hat{f}$ is
\[
\pverr(\hat{f}) := \frac1n \sum_{t=1}^n \ind{f_t(x_t) \neq y_t}
\]
i.e., the number of mistakes made by $\Alg$ on $S_{1:n}$, divided by
$n$.
\begin{proposition}
\label{prop:pv-bound}
Let $\hat{f} := \OTB{\Alg}(S_{1:n-1})$.
For any $\delta \in (0,1)$,
\[
\Pr\del{
\err(\hat{f}) \leq \pverr(\hat{f})
+ \sqrt{\frac{\ln(1/\delta)}{2n}}
}
\geq 1-\delta
\]
where the probability is taken with respect to $S_{1:n}$.
\end{proposition}
\begin{proof}
For each $t=1,2,\dotsc,n$,
define $Z_t := \err(f_t) - \ind{f_t(x_t) \neq y_t}$.
Since $f_t$ only depends on $S_{1:t-1}$, it follows that
$\E(Z_t\,|\,S_{1:t-1}) = 0$.
Furthermore, $Z_t$ takes values in an interval of length one, since
$Z_t \in [\err(f_t)-1, \err(f_t)]$.
Therefore $Z_1, Z_2, \dotsc, Z_n$ is a martingale difference
sequence.
By the Azuma-Hoeffding inequality, for any $\epsilon>0$
\[
\Pr\del{ \frac1n\sum_{t=1}^n Z_t > \frac{\epsilon}{n} }
\leq \exp\del{-2\epsilon^2/n}
.
\]
The probability bound is equal to $\delta$ when $\epsilon :=
\sqrt{n\ln(1/\delta)/2}$.
Therefore, with probability at least $1-\delta$,
\[
\frac1n \sum_{t=1}^n \err(f_t) - \pverr(\hat{f})
= \frac1n \sum_{t=1}^n
\del{ \err(f_t) - \ind{f_t(x_t) \neq y_t} }
\leq
\sqrt{\frac{\ln(1/\delta)}{2n}}
.
\]
The conclusion follows because the randomized classifier $\hat{f}$
has prediction error $\err(\hat{f}) = \sum_{t=1}^n \err(f_t)/n$.
\end{proof}
Thus, while \Cref{prop:pv-bound} lacks the generality of
\Cref{prop:test-bound}, it compensates by allowing the
training procedure $\OTB{\Alg}$ to use $n-1$ examples and providing an
estimator of the prediction error that deviates by no more than
$O(1/\sqrt{n})$ with high probability.
\section*{Bibliographic notes}
The online-to-batch conversion for expected prediction error is due to
\citet{HelmboldW95}, and the deterministic (majority-vote and
averaging) variants were proposed for use with Perceptron by
\citet{FreundS99perceptron}.
The progressive validation technique was proposed by~\citet{BlumKL99}.
\bibliographystyle{plain}
% A better way to create the bibliography is to use BibTeX.
\begin{thebibliography}{1}
\bibitem[Blum et~al.(1999)Blum, Kalai, and Langford]{BlumKL99}
A.~Blum, A.~Kalai, and J.~Langford.
\newblock Beating the hold-out: Bounds for {K-fold} and progressive cross-validation.
\newblock In \emph{Proceedings of the Twelfth Annual Conference on Computational Learning Theory}, pages 203--208, 1999.
\bibitem[Freund and Schapire(1999)]{FreundS99perceptron}
Y.~Freund and R.~E. Schapire.
\newblock Large margin classification using the perceptron algorithm.
\newblock \emph{Machine Learning}, 37(3):277--296, 1999.
\bibitem[Helmbold and Warmuth(1995)]{HelmboldW95}
D.~P. Helmbold and M.~K. Warmuth.
\newblock On weak learning.
\newblock \emph{Journal of Computer and System Sciences}, 50:551--573, 1995.
\end{thebibliography}
\end{document}