%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Thompson Sampling is Asymptotically Optimal in General Environments
% Jan Leike, Tor Lattimore, Laurent Orseau, and Marcus Hutter
% Uncertainty in Artificial Intelligence, 2016
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\documentclass{article}
\usepackage{proceed2e} % UAI style sheet
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[unicode]{hyperref}
\usepackage{enumerate}
\usepackage{amsmath}
\usepackage{times}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{xtab} % breaking the list of notation across coloumns
\usepackage{array} % for centering vertically in tables
\usepackage{tikz}
% Pseudo code
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\newcommand{\algorithmautorefname}{Algorithm}
\usepackage{mythm}
\usepackage{aixi}
\def\Ent{\mathrm{Ent}}
\def\one{1}
\def\eps{\varepsilon}
\def\Var{\mathrm{Var}}
\def\BayesExp{\mbox{BayesExp}}
% For referencing items in the assumption ass:gamma
\newcommand{\assref}[1]{\hyperref[#1]{\autoref*{ass:gamma}\ref*{#1}}}
% Diamonds at the end of examples
\usepackage{etoolbox}
\AtEndEnvironment{example}{
\renewcommand{\qedsymbol}{$\Diamond$}%
\qed
}
% Make section and subsection headings uppercase
\usepackage[explicit]{titlesec}
\titleformat{\section}{\large\bf\raggedright}{\thesection}{1em}{\MakeUppercase{#1}}
\titleformat{\subsection}{\normalsize\bf\raggedright}{\thesubsection}{1em}{\MakeUppercase{#1}}
% autoref Section and Subsection in uppercase
\AtBeginDocument{
\renewcommand{\sectionautorefname}{Section}
\renewcommand{\subsectionautorefname}{Section}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Meta Data
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Thompson Sampling is Asymptotically Optimal in General Environments}
\author{
{\bf Jan Leike} \\
Australian National University \\
\href{mailto:jan.leike@anu.edu.au}{jan.leike@anu.edu.au}
\And
{\bf Tor Lattimore} \\
University of Alberta \\
\href{mailto:tor.lattimore@gmail.com}{tor.lattimore@gmail.com}
\And
{\bf Laurent Orseau} \\
Google DeepMind \\
\href{mailto:lorseau@google.com}{lorseau@google.com}
\And
{\bf Marcus Hutter} \\
Australian National University \\
\href{mailto:marcus.hutter@anu.edu.au}{marcus.hutter@anu.edu.au}
}
% UAI limit: 9 pages + 1 page references + appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\maketitle
\begin{abstract}%
We discuss a variant of Thompson sampling for
nonparametric reinforcement learning in
a countable class of general stochastic environments.
These environments can be non-Markov, non-ergodic, and partially observable.
We show that Thompson sampling learns the environment class
in the sense that
(1) asymptotically its value converges to the optimal value in mean and
(2) given a recoverability assumption regret is sublinear.
\end{abstract}
{\bf Keywords.}
General reinforcement learning,
Thompson sampling,
asymptotic optimality,
regret,
discounting,
recoverability,
AIXI.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
\label{sec:introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\paragraph{General Reinforcement Learning}
In reinforcement learning (RL)
an agent interacts with an unknown environment
with the goal of maximizing rewards.
Recently reinforcement learning has received a surge of interest,
triggered by its success in applications
such as simple video games~\cite{MKSRV+:2015deepQ}.
However, theory is lagging behind application
and most theoretical analysis has been done
in the bandit framework and
for Markov decision processes (MDPs).
These restricted environment classes fall short of
the full reinforcement learning problem and
theoretical results usually assume ergodicity and
visiting every state infinitely often.
Needless to say, these assumptions are not satisfied
for any but the simplest applications.
Our goal is to lift these restrictions;
we consider \emph{general reinforcement learning},
a top-down approach to RL
with the aim to understand the fundamental underlying problems
in their generality.
Our approach to general RL is \emph{nonparametric}:
we only assume that
the true environment belongs to a given countable environment class.
%\paragraph{Optimality}
We are interested in agents that maximize rewards \emph{optimally}.
Since the agent does not know the true environment in advance,
it is not obvious what optimality should mean.
We discuss two different notions of optimality:
\emph{asymptotic optimality} and \emph{worst-case regret}.
%\paragraph{Asymptotic Optimality}
\emph{Asymptotic optimality}
requires that asymptotically the agent learns to act optimally,
i.e., that the discounted value of the agent's policy $\pi$
converges to the optimal discounted value,
$V^*_\mu - V^\pi_\mu \to 0$ for all environments $\mu$
from the environment class.
This convergence is impossible for deterministic policies
since the agent has to explore infinitely often and for long stretches of time,
but there are policies that converge almost surely in Cesàro average~\cite{LH:2011opt}.
Bayes-optimal agents are generally not asymptotically optimal%
~\cite{Orseau:2013}.
However, asymptotic optimality can be achieved through
an exploration component on top of a Bayes-optimal agent%
~\cite[Ch.~5]{Lattimore:2013} or through optimism~\cite{SH:2015opt}.
%\paragraph{PAC}
Asymptotic optimality in mean is essentially a weaker variant of
\emph{probably approximately correct} (PAC)
that comes without a concrete convergence rate:
for all $\varepsilon > 0$ and $\delta > 0$
the probability that our policy is $\varepsilon$-suboptimal
converges to zero (at an unknown rate).
Eventually this probability will be less than $\delta$.
Since our environment class can be very large and non-compact,
concrete PAC/convergence rates are likely impossible.
%\paragraph{Regret}
\emph{Regret} is how many expected rewards the agent forfeits by
not following the best informed policy.
Different problem classes have different regret rates,
depending on the structure and the difficulty of the problem class.
Multi-armed bandits provide a (problem-independent)
worst-case regret bound of $\Omega(\sqrt{KT})$
where $K$ is the number of arms~\cite{BCB:2012bandits}.
In Markov decision processes (MDPs)
the lower bound is $\Omega(\sqrt{DSAT})$
where $S$ is the number of states, $A$ the number of actions, and
$D$ the diameter of the MDP~\cite{AJO:2009MDPs}.
For a countable class of environments given by
state representation functions that map histories to MDP states,
a regret of $\tilde O(T^{2/3})$ is achievable
assuming the resulting MDP is weakly communicating~\cite{NMRO:2013}.
A problem class is considered \emph{learnable}
if there is an algorithm that has a sublinear regret guarantee.
%\paragraph{Relation to AIXI}
This paper continues a narrative that started
with the definition of the Bayesian agent AIXI~\cite{Hutter:2000} and
the proof that it satisfies various optimality guarantees~\cite{Hutter:2002}.
Recently it was revealed that
these optimality notions are trivial or subjective~\cite{LH:2015priors}:
a Bayesian agent does not explore enough to lose the prior's bias,
and a particularly bad prior can make the agent conform to
any arbitrarily bad policy as long as this policy yields
some rewards.
These negative results put the Bayesian approach to (general) RL into question.
In this paper we remedy the situation
by showing that using Bayesian techniques an agent
can indeed be optimal in an objective sense.
%\paragraph{Contribution}
The agent we consider is known as \emph{Thompson sampling},
\emph{posterior sampling}, or
the \emph{Bayesian control rule}~\cite{Thompson:1933}.
It samples an environment $\rho$ from the posterior,
follows the $\rho$-optimal policy for one effective horizon
(a lookahead long enough to encompass most of the discount function's mass),
and then repeats.
We show that this agent's policy is asymptotically optimal in mean
(and, equivalently, in probability).
Furthermore, using a recoverability assumption on the environment,
and some (minor) assumptions on the discount function,
we prove that the worst-case regret is sublinear.
This is the first time convergence and regret bounds
of Thompson sampling have been shown under
such general conditions.
%\paragraph{Related Work on Thompson Sampling}
Thompson sampling was originally proposed by Thompson as a bandit algorithm~\cite{Thompson:1933}.
It is easy to implement and often achieves quite good results%
~\cite{CH:2011Thompson}.
In multi-armed bandits
it attains optimal regret~\cite{AG:2011Thompson,KKM:2012Thompson}.
Thompson sampling has also been considered for MDPs:
as a model-free method relying on distributions over $Q$-functions
with convergence guarantee~\cite{DFR:1998Thompson}, and
as a model-based algorithm without theoretical analysis~\cite{Strens:2000}.
Bayesian and frequentist regret bounds have also been established~\cite{ORR:2013thompson,OVR:2014,GS:2015thompson}.
PAC guarantees have been established
for an optimistic variant of Thompson sampling for MDPs%
~\cite{ALLNW:2009Thompson}.
%\paragraph{Thompson Sampling for General Reinforcement Learning}
For general RL Thompson sampling
was first suggested in \cite{OB:2010Thompson} with resampling at every time step.
The authors prove that the action probabilities of Thompson sampling
converge to the action probability of the optimal policy almost surely,
but require a finite environment class and
two (arguably quite strong)
technical assumptions on the behavior of the posterior distribution
(akin to ergodicity)
and the similarity of environments in the class.
Our convergence results do not require these assumptions,
but we rely on an (unavoidable) recoverability assumption for our regret bound.
\autoref{app:notation} contains a list of notation and
\autoref{app:omitted-proofs} contains omitted proofs.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Preliminaries}
\label{sec:preliminaries}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\subsection{Strings}
%\paragraph{Strings.}
The set $\X^* := \bigcup_{n=0}^\infty \X^n$ is
the set of all finite strings over the alphabet $\X$ and
the set $\X^\infty$ is the set of all infinite strings
over the alphabet $\X$.
The empty string is denoted by $\epsilon$, not to be confused
with the small positive real number $\eps$.
Given a string $x \in \X^*$, we denote its length by $|x|$.
For a (finite or infinite) string $x$ of length $\geq k$,
we denote with $x_{1:k}$ the first $k$ characters of $x$,
and with $x_{<k}$ the first $k - 1$ characters of $x$.
% NOTE(review): a large span of the source is missing here (text between
% `<' and `>' characters appears to have been swallowed during
% extraction); it contained the remainder of the preliminaries, the
% algorithm, and the beginning of the results section.  Surviving
% fragments mention an implication of the form
% ``$\dots > 0$ and $V^\pi_\nu(\ae_{<t}) \dots \Longrightarrow
% a_t \in \argmax_a V^*_\mu(\ae_{<t}a)$''.  The sentence below is the
% surviving tail of a convergence-in-probability remark.
Convergence in mean implies convergence in probability:
$\mu^\pi[\, X_t > \varepsilon \,] \to 0$ as $t \to \infty$
for all $\varepsilon > 0$
because the random variables $X_t$ are nonnegative and bounded.
However, this does not imply almost sure convergence
(see \autoref{ssec:sao}).
Define the \emph{Bayes-expected total variation distance}
\[
F^\pi_m(\ae_{<t}) := \dotsb
% NOTE(review): the right-hand side of this definition and a large span
% of the subsequent source (up to the proof of the main theorem, to
% which the text below belongs) were lost in extraction; restore from
% the published version.
\]
Let $\beta > 0$ and
let $\eps_t > 0$ denote the sequence
used to define $\pi_T$ in \autoref{alg:Thompson-sampling}.
We assume that $t$ is large enough such that
$\eps_k \leq \beta$ for all $k \geq t$ and that
$\delta$ is small enough such that $w(\mu \mid \ae_{<t}) > 4\delta$ for all $t$.
% NOTE(review): parts of the justification that followed were lost in
% extraction; the surviving fragments read ``which holds since
% $w(\mu \mid \ae_{<t}) \dots > 1 - \delta$'' and
% ``$w(\nu \mid \ae_{<t}) \dots F^{\pi_T}_\infty(\ae_{<t}) \dots
% > 4\delta$ by assumption''.
It remains to show that with high probability the value $V^{\pi^*_\rho}_\mu$
of the sample $\rho$'s optimal policy $\pi^*_\rho$ is sufficiently close to
the $\mu$-optimal value $V^*_\mu$.
The worst case is that we draw the worst sample from $\M' \cap \M''$
twice in a row.
From now on, let $\rho$ denote the sample environment we draw at time step $t_0$,
and let $t$ denote some time step between
$t_0$ and $t_1 := t_0 + H_{t_0}(\eps_{t_0})$
(before the next resampling).
% NOTE(review): a large span of the source is missing here; it contained
% the remainder of this proof (the surviving fragment begins ``With
% probability $w(\nu' \mid \dots$'') and the beginning of the example in
% \autoref{ssec:sao}.  The opening of the figure below is reconstructed
% from the structure of its intact right-hand twin.
\begin{center}
\begin{tabular}{cc}
\begin{tikzpicture}
\node[circle, draw, minimum height=2em] (s0) at (0, 0) {$s_0$};
\node[circle, draw, minimum height=2em] (s1) at (-1.5, -1) {$s_1$};
\node[circle, draw, minimum height=2em] (s2) at (0, -2) {$s_2$};
\draw[->] (s0) to[loop above] node[left] {$\beta, \frac{1}{2}$} (s0);
\draw[->] (s0) to[bend right] node[above] {$\alpha, 0$} (s1);
\draw[->] (s1) to[bend right] node[below] {$\beta, 0$} (s0);
\draw[->] (s1) to node[below left] {$\alpha, 0$} (s2);
\draw[->] (s2) to node[right] {$\ast, 0$} (s0);
\end{tikzpicture} & \hspace{-4mm}
\begin{tikzpicture}
\node[circle, draw, minimum height=2em] (s0) at (0, 0) {$s_0$};
\node[circle, draw, minimum height=2em] (s1) at (-1.5, -1) {$s_1$};
\node[circle, draw, minimum height=2em] (s2) at (0, -2) {$s_2$};
\node[circle, draw, minimum height=2em] (s3) at (2, 0) {$s_3$};
\node[circle, draw, minimum height=2em] (s4) at (2, -2) {$s_4$};
\draw[->] (s0) to[loop above] node[left] {$\beta, \frac{1}{2}$} (s0);
\draw[->] (s0) to[bend right] node[above left] {$t < k: \alpha, 0$} (s1);
\draw[->] (s1) to[bend right] node[below] {$\beta, 0$} (s0);
\draw[->] (s1) to node[below left] {$\alpha, 0$} (s2);
\draw[->] (s2) to node[right] {$\ast, 0$} (s0);
\draw[->] (s0) to[bend left] node[above] {$t \geq k: \alpha, 0$} (s3);
\draw[->] (s3) to node[left] {$\alpha, 0$} (s4);
\draw[->] (s3) to[bend left] node[below] {$\beta, 0$} (s0);
\draw[->] (s4) to[loop below] node[left] {$\alpha, 1$} (s4);
\draw[->] (s4) to node[below] {$\beta, 0$} (s2);
\end{tikzpicture} \\
$\nu_\infty$ & $\nu_k$
\end{tabular}
\end{center}
Environment $\nu_k$ works just like environment $\nu_\infty$
except that after time step $k$, the path to state $s_3$ gets unlocked and
the optimal policy is to take action $\alpha$ twice from state $s_0$.
The class $\M$ is a class of deterministic weakly communicating MDPs
(but as an MDP $\nu_k$ has more than 5 states).
The optimal policy in environment $\nu_\infty$ is to always take action $\beta$,
the optimal policy for environment $\nu_k$ is
to take action $\beta$ for $t < k$ and then
take action $\beta$ in state $s_1$ and action $\alpha$ otherwise.
Suppose the policy $\pi_T$ is acting in environment $\nu_\infty$.
Since it is asymptotically optimal in the class $\M$,
it has to take actions $\alpha\alpha$ from $s_0$ infinitely often:
for $t < k$ environment $\nu_k$ is indistinguishable from $\nu_\infty$,
so the posterior for $\nu_k$ is larger or equal to the prior.
Hence there is always a constant chance of sampling $\nu_k$
until taking actions $\alpha\alpha$,
at which point all environments $\nu_k$ for $k \leq t$ become falsified.
If the policy $\pi_T$ decides to explore and take the first action $\alpha$,
it will be in state $s_1$.
Let $\ae_{<t}$ denote the history up to this point.
% NOTE(review): the remainder of this sentence was lost in extraction;
% its surviving tail ``$> 0$'' presumably lower-bounded the value
% difference incurred while exploring — restore from the published
% version.
This happens infinitely often with probability one and
thus we cannot get almost sure convergence.
\end{example}
We expect that strong asymptotic optimality can be achieved
with Thompson sampling by resampling at every time step
(with strong assumptions on the discount function).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Regret}
\label{sec:regret}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Setup}
\label{ssec:regret-setup}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In general environment classes
worst-case regret is linear
because the agent can get caught in a trap and be unable to recover%
~\cite[Sec.~5.3.2]{Hutter:2005}.
To achieve sublinear regret we need to ensure that
the agent can recover from mistakes.
Formally, we make the following assumption.
%-------------------------------%
\begin{definition}[Recoverability]
\label{def:recoverability}
%-------------------------------%
An environment $\nu$ satisfies the \emph{recoverability assumption} iff
\[
\sup_\pi \left|
\EE^{\pi^*_\nu}_\nu[ V^*_\nu(\ae_{<t}) ]
- \EE^{\pi}_\nu[ V^*_\nu(\ae_{<t}) ]
\right| \to 0 \text{ as } t \to \infty.
\]
\end{definition}
% NOTE(review): a span of the source is missing here; it contained the
% discussion of recoverability and the opening of the assumption below,
% whose label and items survive.  The environment wrapper is
% reconstructed so that \assref references resolve.
\begin{assumption}[Discount Function]
\label{ass:gamma}
The discount function $\gamma$ satisfies
\begin{enumerate}[(a)]
\item \label{itm:gamma(a)}
$\gamma_t > 0$ for all $t$,
\item \label{itm:gamma(b)}
$\gamma_t$ is monotone decreasing in $t$, and
\item \label{itm:gamma(c)}
$H_t(\eps) \in o(t)$ for all $\eps > 0$.
\end{enumerate}
\end{assumption}
This assumption demands that the discount function is somewhat well-behaved:
the function has no oscillations, does not become $0$, and
the horizon is not growing too fast.
\autoref{ass:gamma} is satisfied by geometric discounting:
$\gamma_t := \gamma^t > 0$~(\ref{itm:gamma(a)})
for some fixed constant $\gamma \in (0, 1)$ is monotone decreasing~(\ref{itm:gamma(b)}),
$\Gamma_t = \gamma^t / (1 - \gamma)$, and
$H_t(\eps) = \lceil \log_\gamma \eps \rceil \in o(t)$~(\ref{itm:gamma(c)}).
The problem with geometric discounting is that
it makes the recoverability assumption very strong:
since the horizon is not growing, the environment has to enable
\emph{faster recovery} as time progresses;
in this case weakly communicating partially observable MDPs
are \emph{not} recoverable.
A choice with $H_t(\eps) \to \infty$ that satisfies \autoref{ass:gamma} is
$\gamma_t := e^{-\sqrt{t}} / \sqrt{t}$~\cite[Sec.~2.3.1]{Lattimore:2013}.
For this discount function
$\Gamma_t \approx 2e^{-\sqrt{t}}$,
$H_t(\eps) \approx -\sqrt{t}\log \eps + (\log \eps)^2 \in o(t)$, and thus
$H_t(\eps) \to \infty$ as $t \to \infty$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Sublinear Regret}
\label{ssec:sublinear-regret}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This subsection is dedicated to the following theorem.
%-------------------------------%
\begin{theorem}[Sublinear Regret]
\label{thm:aoim-implies-sublinear-regret}
%-------------------------------%
If the discount function $\gamma$ satisfies \autoref{ass:gamma},
the environment $\mu \in \M$ satisfies the recoverability assumption, and
$\pi$ is asymptotically optimal in mean, i.e.,
\[
\EE_\mu^\pi \big[ V^*_\mu(\ae_{<t}) - V^\pi_\mu(\ae_{<t}) \big] \to 0
\text{ as } t \to \infty,
\]
then $\pi$'s regret is sublinear: $R_m(\pi, \mu) \in o(m)$.
\end{theorem}
% NOTE(review): a span of the source is missing here; it contained the
% text preceding the following lemma.  The lemma opening is
% reconstructed (its label is referenced at the proof below).
\begin{lemma}
\label{lem:value-and-regret}
Let $\eps > 0$ and
assume the discount function $\gamma$ satisfies \autoref{ass:gamma}.
Let $(d_t)_{t \in \mathbb{N}}$ be a sequence of numbers with
$|d_t| \leq 1$ for all $t$.
If there is a time step $t_0$ with
\begin{equation}\label{eq:ao}
\frac{1}{\Gamma_t} \sum_{k=t}^\infty \gamma_k d_k < \eps
\quad \forall t \geq t_0
\end{equation}
then
\[
\sum_{t=1}^m d_t
\leq t_0 + \varepsilon(m - t_0 + 1)
+ \frac{1 + \varepsilon}{1 - \varepsilon} H_m(\varepsilon)
\]
\end{lemma}
\begin{proof}
This proof essentially follows
the proof of \cite[Thm.~17]{Hutter:2006discounting};
see \autoref{app:omitted-proofs}.
\end{proof}
\begin{proof}[Proof of \autoref{thm:aoim-implies-sublinear-regret}]
Let $(\pi_m)_{m \in \mathbb{N}}$ denote any sequence of policies,
such as a sequence of policies that
attain the supremum in the definition of regret.
We want to show that
\[
\EE_\mu^{\pi_m} \left[ \sum_{t=1}^m r_t \right]
- \EE_\mu^\pi \left[ \sum_{t=1}^m r_t \right]
\in o(m).
\]
For
\begin{equation}\label{eq:def-d_k}
d_k^{(m)} := \EE_\mu^{\pi_m} [ r_k ] - \EE_\mu^\pi [ r_k ]
\end{equation}
we have $-1 \leq d_k^{(m)} \leq 1$
since we assumed rewards to be bounded between $0$ and $1$.
Because the environment $\mu$ satisfies the recoverability assumption
we have
\begin{align*}
&\left|
\EE^{\pi^*_\mu}_\mu[ V^*_\mu(\ae_{<t}) ]
- \EE^{\pi}_\mu[ V^*_\mu(\ae_{<t}) ]
\right| \to 0 \quad \text{as } t \to \infty.
\end{align*}
% NOTE(review): a span of the source is missing here; it related the
% recoverability assumption to the quantities $d_k^{(m)}$ from
% \eqref{eq:def-d_k}.  The text below is the surviving continuation.
Let $\eps > 0$ and choose $t_0$ independent of $m$ and large enough such that
$\sup_m \sum_{k=t}^\infty \gamma_k d_k^{(m)} / \Gamma_t < \eps$
for all $t \geq t_0$.
Now we let $m \in \mathbb{N}$ be given and
apply \autoref{lem:value-and-regret} to get
\begin{align*}
\frac{R_m(\pi, \mu)}{m}
&= \frac{\sum_{k=1}^m d_k^{(m)}}{m} \\
&\leq \frac{t_0 + \varepsilon(m - t_0 + 1) + \frac{1 + \varepsilon}{1 - \varepsilon} H_m(\varepsilon)}{m}.
\end{align*}
Since $H_t(\eps) \in o(t)$ according to \assref{itm:gamma(c)}
we get $\limsup_{m \to \infty} R_m(\pi, \mu) / m \leq 0$.
\end{proof}
\begin{example}[Converse of \autoref{thm:aoim-implies-sublinear-regret} is False]
\label{ex:sublinear-regret-does-not-imply-ao}
Let $\mu$ be a two-armed Bernoulli bandit with means $0$ and $1$
and suppose we are using geometric discounting
with discount factor $\gamma \in [0, 1)$.
This environment is recoverable.
If our policy $\pi$ pulls the suboptimal arm
exactly on time steps $1, 2, 4, 8, 16, \ldots$,
regret will be logarithmic.
However, on time steps $t = 2^n$ for $n \in \mathbb{N}$
the value difference $V^*_\mu - V^\pi_\mu$
is deterministically at least $1 - \gamma > 0$.
\end{example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Implications}
\label{ssec:regret-implications}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We get the following immediate consequence.
%-------------------------------%
\begin{corollary}[Sublinear Regret for the Optimal Discounted Policy]
\label{cor:discounted-and-undiscounted-regret}
%-------------------------------%
If the discount function $\gamma$ satisfies \autoref{ass:gamma} and
the environment $\mu$ satisfies the recoverability assumption,
then
$
R_m(\pi^*_\mu, \mu)
\in o(m)
$.
\end{corollary}
\begin{proof}
From \autoref{thm:aoim-implies-sublinear-regret}
since the policy $\pi^*_\mu$ is (trivially) asymptotically optimal in $\mu$.
\end{proof}
If the environment does not satisfy the recoverability assumption,
regret may be linear \emph{even on the optimal policy}:
the optimal policy maximizes discounted rewards and
this short-sightedness might incur a tradeoff
that leads to linear regret later on
if the environment does not allow recovery.
%-------------------------------%
\begin{corollary}[Sublinear Regret for Thompson Sampling]
\label{cor:thompson-sampling-regret}
%-------------------------------%
If the discount function $\gamma$ satisfies \autoref{ass:gamma} and
the environment $\mu \in \M$ satisfies the recoverability assumption,
then $R_m(\pi_T, \mu) \in o(m)$
for the Thompson sampling policy $\pi_T$.
\end{corollary}
\begin{proof}
From \autoref{thm:taxi-ao} and \autoref{thm:aoim-implies-sublinear-regret}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussion}
\label{sec:discussion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\paragraph{Summary}
In this paper we introduced a reinforcement learning policy $\pi_T$ based on Thompson sampling
for general countable environment classes~(\autoref{alg:Thompson-sampling}).
We proved two asymptotic statements about this policy.
\autoref{thm:taxi-ao} states that $\pi_T$ is asymptotically optimal in mean:
the value of $\pi_T$ in the true environment
converges to the optimal value.
\autoref{cor:thompson-sampling-regret} states that
the regret of $\pi_T$ is sublinear:
the difference of the expected average rewards between $\pi_T$
and the best informed policy converges to $0$.
Both statements come without a concrete convergence rate because of the
weak assumptions we made on the environment class.
%\paragraph{Problems with Asymptotic Optimality}
Asymptotic optimality has to be taken with a grain of salt.
It provides no incentive to the agent to avoid traps in the environment.
Once the agent gets caught in a trap, all actions are equally bad and thus
optimal: asymptotic optimality has been achieved.
Even worse, an asymptotically optimal agent has to explore all the traps
because they might contain hidden treasure.
Overall, there is a dichotomy between the asymptotic nature of
asymptotic optimality and the use of discounting to prioritize the present over the future.
Ideally, we would want to give finite guarantees instead,
but without additional assumptions this is likely impossible in this general setting.
Our regret bound could be a step in the right direction,
even though itself asymptotic in nature.
%\paragraph{What does Convergence Mean?}
For Bayesians asymptotic optimality means that
the posterior distribution $w(\,\cdot \mid \ae_{<t})$ concentrates.
% NOTE(review): reconstruction — the remainder of this sentence, the
% rest of the discussion, \autoref{app:notation}, and the opening of
% \autoref{app:omitted-proofs} were lost in extraction.  The text below
% is the proof of \autoref{lem:value-and-regret}; the proof-environment
% opening is restored to match the \end{proof} at the end of the file.
\begin{proof}[Proof of \autoref{lem:value-and-regret}]
By \assref{itm:gamma(a)} we have $\gamma_t > 0$ for all $t$
and hence $\Gamma_t > 0$ for all $t$.
By \assref{itm:gamma(b)}
we have that $\gamma$ is monotone decreasing,
so we get for all $n \in \mathbb{N}$
\[
\Gamma_t
= \sum_{k=t}^\infty \gamma_k
\leq \sum_{k=t}^{\mathclap{t+n-1}} \gamma_t
+ \sum_{\mathclap{k=t+n}}^\infty \gamma_k
= n\gamma_t + \Gamma_{t+n}.
\]
And with $n := H_t(\eps)$ this yields
\begin{equation}\label{eq:t-independent-bound}
\frac{\gamma_t H_t(\eps)}{\Gamma_t}
\geq 1 - \frac{\Gamma_{t+H_t(\eps)}}{\Gamma_t}
\geq 1 - \eps
> 0.
\end{equation}
In particular, this bound holds for all $t$ and $\eps > 0$.
Next, we define a series of nonnegative weights $(b_t)_{t \geq 1}$ such that
\[
\sum_{t=t_0}^m d_t
= \sum_{t=t_0}^m \frac{b_t}{\Gamma_t} \sum_{k=t}^m \gamma_k d_k.
\]
This yields the constraints
\[
\sum_{k=t_0}^t \frac{b_k}{\Gamma_k} \gamma_t = 1 \quad \forall t \geq t_0.
\]
The solution to these constraints is
\begin{equation}\label{eq:b_t-solution}
b_{t_0} = \frac{\Gamma_{t_0}}{\gamma_{t_0}},
\text{ and }
b_t = \frac{\Gamma_t}{\gamma_t} - \frac{\Gamma_t}{\gamma_{t-1}}
\text{ for $t > t_0$}.
\end{equation}
Thus we get
\begin{align*}
\sum_{t=t_0}^m b_t
&= \frac{\Gamma_{t_0}}{\gamma_{t_0}}
+ \sum_{t=t_0+1}^m \left(
\frac{\Gamma_t}{\gamma_t} - \frac{\Gamma_t}{\gamma_{t-1}}
\right) \\
&= \frac{\Gamma_{m+1}}{\gamma_m}
+ \sum_{t=t_0}^m \left(
\frac{\Gamma_t}{\gamma_t} - \frac{\Gamma_{t+1}}{\gamma_t}
\right) \\
&= \frac{\Gamma_{m+1}}{\gamma_m} + m - t_0 + 1 \\
&\leq \frac{H_m(\varepsilon)}{1 - \varepsilon} + m - t_0 + 1
\end{align*}
for all $\varepsilon > 0$ according to \eqref{eq:t-independent-bound}.
Finally,
\begin{align*}
\sum_{t=1}^m d_t
&\leq \sum_{t=1}^{t_0} d_t
+ \sum_{t=t_0}^m \frac{b_t}{\Gamma_t} \sum_{k=t}^m \gamma_k d_k \\
&\leq t_0
+ \sum_{t=t_0}^m \frac{b_t}{\Gamma_t} \sum_{k=t}^\infty \gamma_k d_k
- \sum_{t=t_0}^m \frac{b_t}{\Gamma_t} \sum_{k=m+1}^\infty \gamma_k d_k \\
\intertext{
and using the assumption \eqref{eq:ao} and $d_t \geq -1$,
}
&< t_0
+ \sum_{t=t_0}^m b_t \varepsilon
+ \sum_{t=t_0}^m \frac{b_t \Gamma_{m+1}}{\Gamma_t} \\
&\leq t_0 + \frac{\varepsilon H_m(\varepsilon)}{1 - \varepsilon}
+ \varepsilon (m - t_0 + 1)
+ \sum_{t=t_0}^m \frac{b_t \Gamma_{m+1}}{\Gamma_t}
\end{align*}
For the latter term we substitute \eqref{eq:b_t-solution} to get
\begin{align*}
\sum_{t=t_0}^m \frac{b_t \Gamma_{m+1}}{\Gamma_t}
&= \frac{\Gamma_{m+1}}{\gamma_{t_0}} + \sum_{t=t_0 + 1}^m
\left( \frac{\Gamma_{m+1}}{\gamma_t} - \frac{\Gamma_{m+1}}{\gamma_{t-1}} \right) \\
&= \frac{\Gamma_{m+1}}{\gamma_m}
\leq \frac{H_m(\eps)}{1 - \eps}
\end{align*}
with \eqref{eq:t-independent-bound}.
\end{proof}
\end{document}