\documentclass[12pt]{article}
\setlength{\topmargin}{1in}
\setlength{\textheight}{9in}
\setlength{\textwidth}{6in}
\setlength{\oddsidemargin}{1.25in}
\setlength{\evensidemargin}{1.25in}
\usepackage{amsthm}
\usepackage[sumlimits]{amsmath}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage[all]{xy}
\usepackage{wasysym}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{url}
\usepackage{natbib}
\usepackage{authblk}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Space Saving Tricks!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\sloppy\lineskip=0pt
\usepackage{times}
\usepackage{multitoc}
%\usepackage{mathtime}
\usepackage[small]{titlesec} % \usepackage[small,compact]{titlesec}
\renewcommand{\baselinestretch}{0.98}
%\addtolength{\textfloatsep}{-5mm}
\setlength{\baselineskip}{0mm}
\setlength{\parindent}{1em}
%\usepackage{mathptmx}
\setlength{\parskip}{0cm}
\titlespacing{\section}{0pt}{2ex}{1ex}
\titlespacing{\subsection}{0pt}{1.5ex}{1ex}
\titlespacing{\subsubsection}{0pt}{1.5ex}{0.5ex}
\linespread{0.9475}
\titlespacing{\paragraph}{%
0pt}{% left margin
0.5\baselineskip}{% space before (vertical)
0.5em}%
\let\oldnormalsize\normalsize
\def\normalsize{\oldnormalsize%
\abovedisplayskip5pt plus2pt minus 2pt%
\belowdisplayskip5pt plus2pt minus 2pt}
\setlength{\tabcolsep}{2.65pt}
%-------------------------------%
% Macro-Definitions %
%-------------------------------%
%\newif\ifdcc\dcctrue % DCC submission
\newif\ifdcc\dccfalse % Tech report
\makeatletter
% Unnumbered theorem environment whose heading text is taken from \rep@title,
% which is set just-in-time by the wrapper environments created below.
\newtheorem*{rep@theorem}{\rep@title}
% \newreptheorem{<env>}{<Title>} defines an environment rep<env> taking a label
% as its argument; it restates a previously numbered result, typesetting the
% heading "<Title> \ref{<label>}" (e.g. \begin{reptheorem}{thm:foo} ... ).
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
\def\rep@title{#2 \ref{##1}}%
\begin{rep@theorem}}%
{\end{rep@theorem}}}
\makeatother
\newtheorem{defn}{Definition}
\newtheorem{lem}{Lemma}
\newreptheorem{lemma}{Lemma}
\newtheorem{thm}{Theorem}
\newreptheorem{theorem}{Theorem}
\newtheorem{rem}{Remark}
\newtheorem{notation}{Notation}
\newtheorem{proposition}{Proposition}
\newtheorem{conjecture}{Conjecture}
\newtheorem{corollary}{Corollary}
\theoremstyle{definition}\newtheorem{example}{Example}
% Conditioning bars used in probability expressions: \cdbar is a bare bar for
% tight display-math contexts, \cbar adds thin spaces for inline use.
\def\cdbar{|}
\def\cbar{\;|\;}
% Calligraphic letter shorthands for the paper's standing notation
% (alphabet, index sequences, model class, suffix sets, PST classes, contexts).
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cK}{\mathcal{K}}
% Small-caps algorithm names used in math mode.
\newcommand{\cts}{\text{\sc cts}}
\newcommand{\ctw}{\text{\sc ctw}}
% Wrap #1 in script-size text.
\def\ss#1{\begin{scriptsize}#1\end{scriptsize}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% T i t l e - P a g e %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\fontsize{11.5}{14}\rm
\def\name{}
\def\email{\small\hfill\sc}
\def\addr#1{\small\it #1}
\title{
\bf\LARGE\hrule height5pt \vskip 4mm
Context Tree Switching
\vskip 4mm \hrule height2pt
}
\renewcommand\Authsep{~~~~}
\renewcommand\Authand{~~~~}
\renewcommand\Authands{~~~~}
\setlength{\affilsep}{0.5em}
\author[$\dagger$]{Joel Veness}
\author[$\ddagger$]{Kee Siong Ng}
\author[$\ddagger$]{Marcus Hutter}
\author[$\dagger$]{Michael Bowling}
\affil[$\dagger$]{\small University of Alberta, Edmonton, Canada}
\affil[$\ddagger$]{\small Australian National University, Canberra, Australia}
\date{}
\maketitle
\vspace{-2em}
\begin{abstract}
This paper describes the Context Tree Switching technique, a modification of Context Tree Weighting for the prediction of binary, stationary, $n$-Markov sources.
By modifying Context Tree Weighting's recursive weighting scheme, it is possible to mix over a strictly larger class of models without increasing the asymptotic time or space complexity of the original algorithm.
We prove that this generalization preserves the desirable theoretical properties of Context Tree Weighting on stationary $n$-Markov sources, and show empirically that this new technique leads to
%small but
consistent improvements over Context Tree Weighting as measured on the Calgary Corpus.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}\label{sec:intro}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Context Tree Weighting \citep{ctw95} is a well-known, universal lossless compression algorithm for binary, stationary, $n$-Markov sources.
It provides a striking example of a technique that works well both in theory and practice.
Similar to Prediction by Partial Matching \citep{Cleary84datacompression}, Context Tree Weighting (CTW) uses a context tree data structure to store statistics about the current data source.
These statistics are recursively combined by \emph{weighting}, which leads to an elegant algorithm whose worst-case performance can be characterized by an analytic regret bound that holds for \emph{any} finite length data sequence, as well as asymptotically achieving (in expectation) the lower bound of \cite{rissanen84} for the class of binary, stationary $n$-Markov sources.
This paper explores an alternative recursive weighting procedure for CTW, which weights over a strictly larger class of models without increasing the asymptotic time or space complexity of the original algorithm.
We call this new procedure the Context Tree Switching (CTS) algorithm, which we investigate both theoretically and empirically.
%%%%%%%%%%%%%%%%%%%%%
\section{Background}
%%%%%%%%%%%%%%%%%%%%%
%------------------------------------%
%\subsection{Data Generating Sources}
%------------------------------------%
We begin with some notation and definitions for binary data generating sources.
%
%\subsubsection{Binary Strings}
Our binary alphabet is denoted by $\cX := \{ 0, 1 \}$.
A binary string $x_1x_2 \ldots x_n \in \cX^n$ of length $n$ is denoted by $x_{1:n}$.
The prefix $x_{1:j}$ of $x_{1:n}$, $j\leq n$, is denoted by $x_{\leq j}$ or $x_{< j+1}$.
The empty string is denoted by $\epsilon$.
The concatenation of two strings $s$ and $r$ is denoted by $sr$.
If $\cS$ is a set of strings and $r \in \{ 0,1 \}$, then $\cS \times r := \{ sr : s \in \cS \}$.
We will also use $l(s)$ to denote the length of a string $s$.
\subsection{Probabilistic Binary Sources}
We define a probabilistic data generating source $\rho$ to be a set of functions $\rho_n : \cX^n \to [0,1]$, for $n\in\mathbb{N}$, satisfying the constraint that $\rho_n(x_{1:n}) = \sum_{y\in\cX} \rho_{n+1}(x_{1:n}y)$ for all $x_{1:n} \in \cX^n$, with base case $\rho_0(\epsilon) = 1$.
As the meaning is always clear from the argument to $\rho$, we drop the subscripts on $\rho$ from here onwards.
Under this definition, the conditional probability of a symbol $x_n$ given previous data $x_{<n}$ is defined as $\rho(x_n \cbar x_{<n}) := \rho(x_{1:n}) / \rho(x_{<n})$ whenever $\rho(x_{<n}) > 0$, with the familiar chain rule $\rho(x_{1:n}) = \prod_{i=1}^n \rho(x_i \cbar x_{<i})$ following immediately.
% NOTE(review): a span of the original text appears to have been lost in extraction here
% (the remainder of the background on sources and the introduction to ensemble methods);
% restore from the original. The Weighting subsection below is reconstructed so that the
% reference to \ref{sec:weighting} in the CTS section resolves.
\subsubsection{Weighting}\label{sec:weighting}
A fundamental technique for combining the predictions of a finite set $\cM$ of models is \emph{weighting}, which forms the mixture $\xi(x_{1:n}) := \sum_{\rho \in \cM} w^\rho_0 \, \rho(x_{1:n})$, using prior weights $w^\rho_0 > 0$ such that $\sum_{\rho \in \cM} w^\rho_0 = 1$.
Notice that if some model $\rho^* \in \cM$ is a good source coding distribution for a data sequence $x_{1:n}$, then provided $n$ is sufficiently large, $\xi$ will be a good coding distribution, since
\begin{equation}\label{eq:weighting_bound}
-\log_2 \xi(x_{1:n}) = -\log_2 \sum_{\rho \in \cM} w_0^\rho \rho(x_{1:n}) \leq -\log_2 w_0^{\rho} \rho(x_{1:n}) = -\log_2 w_0^{\rho} -\log_2 \rho(x_{1:n})
\end{equation}
holds for all $\rho \in \cM$.
Therefore, we would only need at most an extra $-\log_2 w_0^{\rho^*}$ bits, an amount independent of $n$, to transmit $x_{1:n}$ using $\xi$ instead of the best model $\rho^*$ in $\cM$.
An important special case of this result is when $|\cM|=2$ and $w_0^{\rho_1} = w_0^{\rho_2} = \tfrac{1}{2}$, when only $1$ extra bit is required.
\subsubsection{Switching}\label{sec:switching}
While weighting provides an easy way to combine models, as an ensemble method it is somewhat limited in that it only guarantees performance in terms of the best \emph{single} model in $\cM$.
It is easy to imagine situations where this would be insufficient in practice.
Instead, one could consider weighting over \emph{sequences} of models chosen from a fixed base class $\cM$.
Variants of this fundamental idea have been considered by authors from quite different research communities.
Within the data compression community, there is the Switching Method and the Snake algorithm \citep{Volf98switchingbetween}.
Similar approaches were also considered in the online learning community, in particular the Fixed-Share \citep{herbster1998} algorithm for tracking the best expert over time.
From the machine learning community, related ideas were investigated in the context of Bayesian model selection, giving rise to the Switch Distribution \citep{erven2007}.
The setup we use draws most heavily on \citep{erven2007}, though there appears to be considerable overlap amongst the approaches.
%The reasons why we chose this particular definition will become clear once we discuss its theoretical and computational properties.
\begin{defn}
\label{def:switch_distribution}
Given a finite model class $\cM = \{\rho_1, \dots, \rho_N\}$ with $N > 1$, for all $n \in \mathbb{N}$, for all $x_{1:n} \in \cX^n$, the Switch Distribution with respect to $\cM$ is defined as
\begin{equation}\label{eq:switch_distribution}
\tau_{\boldsymbol{\alpha}}(x_{1:n}) := \sum\limits_{i_{1:n} \in \cI_n(\cM)} w(i_{1:n}) \prod_{k=1}^n \rho_{i_k}(x_k \cdbar x_{<k}),
\end{equation}
where $\cI_n(\cM)$ denotes the set of model index sequences of length $n$ over $\cM$, and $w(i_{1:n})$ denotes the prior weight assigned to the index sequence $i_{1:n}$.
% NOTE(review): the definition of the prior $w(i_{1:n})$ in terms of the switching rates
% $\alpha_k$ was lost in extraction; restore from the original.
\end{defn}
\begin{algorithm}
\caption{The Switch Distribution}
\begin{algorithmic}
\REQUIRE A finite model class $\cM = \{ \rho_1, \dots, \rho_N \}$ with $N > 1$
\REQUIRE A weight vector $(w_1, \dots, w_N) \in \mathbb{R}^N$, with $w_i = \tfrac{1}{N}$ for $1 \leq i \leq N$
\REQUIRE A sequence of switching rates $\{ \alpha_2, \alpha_3, \dots, \alpha_n \}$
\medskip
\STATE $r \leftarrow 1$
\FOR{$i=1$ to $n$}
\STATE $r \leftarrow r \times \sum\limits_{j=1}^{N} w_j \rho_j(x_i \cdbar x_{<i})$
% NOTE(review): the weight-update steps of this algorithm, and the background sections
% that followed it (the KT estimator, prediction suffix trees, and the labelled equations
% eq:kt_parameter_redun, eq:pst_parameter_redundancy and eq:coding_redundancy referenced
% later), were lost in extraction; restore from the original.
\ENDFOR
\RETURN $r$
\end{algorithmic}
\end{algorithm}
\ifdcc\else
% NOTE(review): the \ifdcc opener matching the \fi below was lost in extraction; the
% derivation here is reconstructed and should be checked against the original. Here $a_s$
% and $b_s$ denote the number of zeros and ones with context $s$, so $\sum_{s\in\cS}(a_s+b_s)=n$.
Summing the per-context parameter redundancy over a suffix set $\cS$ gives
\begin{eqnarray}
-\log_2 \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) + \log_2 \Pr(x_{1:n} \,|\, \cS, \Theta_{\cS})
\leq \sum_{s \in \cS \,:\, a_s + b_s > 0}\left( \tfrac{1}{2} \log_2(a_s + b_s) + 1 \right)
\leq \sum_{s\in\cS} \gamma(a_s + b_s)
\notag
\\= |\cS| \sum_{s\in\cS} \frac{\gamma(a_s + b_s)}{|\cS|} \leq
|\cS| \gamma \left( \sum\limits_{s\in\cS} \frac{a_s+b_s}{|\cS|} \right) =
|\cS| \gamma(\tfrac{n}{|\cS|}).
\end{eqnarray}
The first inequality follows from Equation \ref{eq:kt_parameter_redun}, with the second inequality following from an application of Jensen's Inequality.
\fi
%-------------------------------------------------------------------------------%
\subsubsection{Weighting Over Prediction Suffix Trees}
%-------------------------------------------------------------------------------%
The Context Tree Weighting algorithm combines the data partitioning properties of a PST, a carefully chosen weighting scheme, and the distributive law to efficiently weight over the space of PST structures of bounded depth $D\in\mathbb{N}$.
We now introduce some notation to make this process explicit.
\begin{defn}
The set of all complete and proper suffix sets of bounded depth $D$ is denoted by $\cC_D$, and is given by the recurrence
\begin{equation}
\cC_D := \left\{
\begin{array}{lr}
\bigl\{ \{ \epsilon \} \bigr\} \text{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~if~} D = 0 \\
\bigl\{ \{ \epsilon \} \bigr\} \cup \left\{ \cS_1 \times 1 \cup \cS_2 \times 0 : \cS_1, \cS_2 \in \cC_{D-1} \right\} \text{~if~} D > 0. \\
\end{array}
\right.
\end{equation}
\end{defn}
Notice that $|\cC_D|$ grows roughly double exponentially in $D$.
For example, $|\cC_0| = 1, |\cC_1| = 2, |\cC_2| = 5, |\cC_3| = 26, |\cC_4| = 677, |\cC_5| = 458330$, which means that some ingenuity is required to weight over all $\cC_D$ for any reasonably sized $D$.
This comes in the form of a weighting scheme that is derived from a natural prefix coding of the structure of a PST.
It works as follows: given a PST structure with depth no more than $D$, a pre-order traversal of the tree is performed.
Each time an internal node is encountered for the first time, a 1 is written down.
Each time a leaf node is encountered, a 0 is written if the depth of the leaf node is less than $D$, otherwise nothing is written.
For example, if $D = 3$, the code for the model shown in Figure \ref{fig:pst} is 10100; if $D = 2$, the code for the same model is 101.
We now define the cost $\Gamma_{D}(\cS)$ of a suffix set $\cS$ to be the length of its code.
One can show that $\sum_{\cS \in \cC_D} 2^{-\Gamma_{D}(\cS)} = 1$; i.e.\ the prefix code is complete.
Thus we can now define
\begin{equation}\label{eq:ctw_def}
\ctw_D(x_{1:n}) := \sum_{\cS \in \cC_{D}} 2^{-\Gamma_{D}(\cS)} \prod\limits_{s\in \cS} \xi_{KT}(x^s_{1:n}).
\end{equation}
%We remark that the above is another way of describing the coding scheme in \cite{ctw95}.
Notice also that this choice of weighting imposes an Ockham-like penalty on large PST structures.
\paragraph{Recursive Decomposition}
If we let $\cK_D := \{0,1\}^*$ denote the set of all possible contexts for class $\cC_D$,
$x^c_{1:n}$ denote the subsequence of data $x_{1:n}$ that matches context $c \in \cK_D$, and define $\ctw^\epsilon_D(x_{1:n}) := \ctw_D(x_{1:n})$, we can decompose Equation \ref{eq:ctw_def} into \citep[see][]{ctw95}
%\begin{eqnarray}
\begin{equation}
%\text{\sc ctw}_D(x^c_{1:n}) &=& \sum_{\rho \in \cC_{D}} 2^{-\Gamma_{D}(\rho)} \psi_{\rho}(x^{c}_{1:n})
\ctw^c_D(x_{1:n})
%=\sum_{\cS \in \cC_{D}} 2^{-\Gamma_{D}(\cS)} \prod\limits_{s\in \cS} \xi_{KT}((x^c_{1:n})^s)
%\notag \\
%&=& \tfrac{1}{2}\xi_{KT}(x^c_{1:n}) + \tfrac{1}{2}\sum\limits_{\rho \in \cC_{D-1}} \sum\limits_{\nu \in \cC_{D-1}} 2^{-\Gamma_{D-1}(\rho) -\Gamma_{D-1}(\nu)} \psi_\rho(x^{c0}_{1:n}) \psi_\nu(x^{c1}_{1:n}) \notag \\
%&=& \tfrac{1}{2}\xi_{KT}(x^c_{1:n}) + \tfrac{1}{2}\left( \sum\limits_{\rho \in \cC_{D-1}} 2^{-\Gamma_{D-1}(\rho)} \psi_\rho(x^{c0}_{1:n}) \right) \left( \sum\limits_{\nu \in \cC_{D-1}} 2^{-\Gamma_{D-1}(\rho)} \psi_\nu(x^{c1}_{1:n}) \right) \notag\\
%&=&
= \tfrac{1}{2} \, \xi_{KT}(x^c_{1:n}) + \tfrac{1}{2} \, \text{\sc ctw}^{0c}_{D-1}(x_{1:n}) \, \text{\sc ctw}^{1c}_{D-1}(x_{1:n})\label{eq:ctw_rec},
\end{equation}
%\end{eqnarray}
for $D > 0$.
In the base case of a single node (i.e. weighting over $\cC_0$) we have $\text{\sc ctw}^c_0(x_{1:n}) = \xi_{KT}(x^c_{1:n})$.
\paragraph{Computational Properties}
The efficiency of CTW derives from Equation \ref{eq:ctw_rec}, since the double mixture can be maintained incrementally by applying it $D+1$ times to process each new symbol.
Therefore, using the Context Tree Weighting algorithm, only $O(n D)$ time is required to compute $\text{\sc ctw}_D(x_{1:n})$.
Furthermore, only $O(D)$ work is required to compute $\text{\sc ctw}_D(x_{1:n+1})$ from $\text{\sc ctw}_D(x_{1:n})$.
\paragraph{Theoretical Properties}
Using Equation \ref{eq:weighting_bound}, the model redundancy can be bounded by
\begin{equation*}
-\log_2 \text{\sc ctw}_D(x_{1:n}) = -\log_2 \left( \sum_{\cS \in \cC_{D}} 2^{-\Gamma_{D}(\cS)} \prod\limits_{s\in \cS} \xi_{KT}(x^s_{1:n}) \right) < \Gamma_{D}(\cS) -\log_2 \prod\limits_{s\in \cS} \xi_{KT}(x^s_{1:n}).
\end{equation*}
This can be combined with the parameter redundancy specified by Equation \ref{eq:pst_parameter_redundancy} to give
\begin{equation}\label{eq:ctw_bound}
-\log_2 \text{\sc ctw}_D(x_{1:n}) < \Gamma_{D}(\cS) + |\cS| \gamma \left( \tfrac{n}{|\cS|} \right) - \log_2 \Pr(x_{1:n} \,|\, \cS, \Theta_{\cS})
\end{equation}
for any $\cS \in \cC_D$.
Finally, combining Equation \ref{eq:ctw_bound} with the coding redundancy bound given in Equation \ref{eq:coding_redundancy} leads to the main theoretical result for CTW.
\begin{thm}[\cite{ctw95}]\label{thm:ctw_redundancy_thm}
For all $n\in\mathbb{N}$, given a data sequence $x_{1:n} \in \cX^n$ generated by a binary PST source $(\cS, \Theta_{\cS})$ with $\cS \in \cC_D$ and $\Theta_{\cS} := \{ \theta_s : \theta_s \in [0,1] \}_{s \in \cS}$, the redundancy of CTW using context depth $D \in \mathbb{N}$ is upper bounded by
$\Gamma_D(\cS) + |\cS| \gamma \left( \tfrac{n}{|\cS|} \right) + 2$.
\end{thm}
%-----------------------------------------%
\section{Context Tree Switching}
%-----------------------------------------%
Context Tree Switching is a natural combination of CTW and switching.
To see this, first note that Equation \ref{eq:ctw_rec} allows us to interpret CTW as a recursive application of the weighting method of Section \ref{sec:weighting}.
Recalling Theorem \ref{thm:combo}, we know that a careful application of switching essentially preserves the good properties of weighting, and may even work better provided some rarely changing sequence of models predicts the data well.
Using a class of PST models, it seems reasonable to suspect
%(see the discussion of the ``Catch Up Phenonmonen'' in \ref{} for more motivation)
that the best model may change over time; for example, a large PST model might work well given sufficient data, but before then a smaller model might be more accurate due to its smaller parameter redundancy.
%This motivated us to replace the recursive weighting specified by Equation \ref{eq:ctw_rec} for CTW with the switching technique.
The main insight behind CTS is to weight over all \emph{sequences} of bounded depth PST structures by recursively using the efficient switching technique of Section \ref{sec:switching} as a replacement for Equation \ref{eq:ctw_rec}.
This gives, for all $n \in \mathbb{N}$, for all $x_{1:n} \in \cX^n$, the following recursion for $D > 0$,
\begin{equation}\label{eq:cts_recursion_math}
\footnotesize
\cts^c_D(x_{1:n}) := \hspace{-17pt} \sum_{i_{1:{n_c}} \in \{ 0, 1\}^{n_c}} \hspace{-13pt} w_c(i_{1:{n_c}}) \prod_{k=1}^{n_c} \left[ \mathbb{I}[i_k \hspace{-3pt}=\hspace{-2.5pt} 0] \, \frac{\xi_{KT}( [x^c_{1:n}]_{1:k}) }{\xi_{KT}( [x^c_{1:n}]_{<k})} + \mathbb{I}[i_k \hspace{-3pt}=\hspace{-2.5pt} 1] \, \frac{ \cts^{0c}_{D-1}(x_{1:t_k}) \, \cts^{1c}_{D-1}(x_{1:t_k}) }{ \cts^{0c}_{D-1}(x_{1:t_k - 1}) \, \cts^{1c}_{D-1}(x_{1:t_k - 1}) } \right],
\end{equation}
where $n_c$ denotes the number of symbols in $x_{1:n}$ whose context matches $c$, $t_k$ denotes the index in $x_{1:n}$ of the $k$-th such symbol, and $\cts^c_0(x_{1:n}) := \xi_{KT}(x^c_{1:n})$.
% NOTE(review): this recursion was reconstructed from a garbled extraction; verify its exact
% form against the original.
This quantity can be maintained incrementally using, for $D > 0$, the following set of recursive update equations
\vspace{0.2em}
\begin{eqnarray}
\cts_D^c(x_{1:n}) & \leftarrow & k_{c} \, \xi_{KT}(x^c_n \cdbar x^c_{<n}) + s_{c} \, \cts^{0c}_{D-1}(x_{1:n}) \, \cts^{1c}_{D-1}(x_{1:n}),
\end{eqnarray}
where $\xi_{KT}(x^c_n \cdbar x^c_{<n})$ denotes the conditional KT probability of the most recent symbol with context $c$, and $k_c$ and $s_c$ denote the switching weights maintained at context $c$.
% NOTE(review): the update rules for the weights $k_c$ and $s_c$, the complexity discussion,
% the statement of the main CTS redundancy theorem, and the opening of its proof were lost in
% extraction; restore from the original. The \ifdcc opener below matches the \fi that closes
% this proof section, and eq:cts_lb is reconstructed so that the later references resolve.
\ifdcc\else
\begin{reptheorem}{thm:cts_redundancy}
% NOTE(review): restated theorem body lost in extraction; restore from the original.
\begin{proof}
By dropping all but a single model index sequence from the mixture in Definition \ref{def:switch_distribution}, we obtain the lower bound
\begin{equation}\label{eq:cts_lb}
\cts_D^c(x_{1:n}) \geq
\begin{cases}
\max\left\{ w_c(0_{1:{n_c}}) \, \xi_{KT}(x^c_{1:n}), \; w_c(1_{1:{n_c}}) \, \cts^{0c}_{D-1}(x_{1:n}) \, \cts^{1c}_{D-1}(x_{1:n}) \right\} &\text{if $D > 0$} \\
\xi_{KT}(x^c_{1:n}) &\text{if $D = 0$},
\end{cases}
\end{equation}
for any sub-context $c \in \cS \cup \tilde{\cS}$.
Next define $\cS' := \{ s \in \cS \,:\, l(s) < D \}$.
Now, by repeatedly applying Equation \ref{eq:cts_lb}, starting with $\cts_D(x_{1:n})$ (which recall is defined as $\cts^{\epsilon}_D(x_{1:n})$) and continuing until no more $\cts(\cdot)$ terms remain, we can conclude
\begin{eqnarray*}
\cts_D(x_{1:n}) &\geq& \left( \prod_{c \in \tilde{\cS}} w_c(1_{1:{n_c}}) \right) \left( \prod_{s \in \cS'} w_s(0_{1:{n_s}}) \right) \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right) \\
&=& \left( \prod^{d(\cS)}_{k=0} \prod_{c \in \cS' \cup \tilde{\cS} \,:\, l(c) = k} w_c(1_{1:{n_c}}) \right) \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right) \\
&\geq& \left( 2^{-\Gamma_D(\cS)} \prod^{d(\cS)}_{k=0} \prod_{c \in \cS' \cup \tilde{\cS} \,:\, l(c) = k} \frac{w_c(1_{1:{n_c}})}{w_c(1_{1:\min(n_c, 1)})} \right) \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right)\\
&\geq& \left( 2^{-\Gamma_D(\cS)} \prod^{d(\cS)}_{k=0} \prod^{n}_{t=2} \frac{t-1}{t} \right) \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right) \\
&=& 2^{-\Gamma_D(\cS)} n^{-(d(\cS)+1)} \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right).
\end{eqnarray*}
The first equality follows by noting that Definition \ref{def:switch_distribution} implies $w_c(0_{1:t}) = w_c(1_{1:t})$ for all $t \in \mathbb{N}$ and rearranging.
The second inequality follows from $|\cS' \cup \tilde{\cS}|=\Gamma_D(\cS)$, $w_c(1)=\tfrac{1}{2}$ and that either $w_c(1_{1:n_c})=w_c(\epsilon)=1$ if $n_c = 0$ or $w_c(1_{1:n_c}) = \tfrac{1}{2} \times \dots$ for $n_c > 0$.
The last inequality follows from the observation that the context associated with each symbol in $x_{1:n}$ matches at most one context $c \in \cS' \cup \tilde{\cS}$ of each specific length $0 \leq k \leq d(\cS)$.
The final equality follows upon simplification of the telescoping product.
Hence,
\begin{equation}\label{eq:cts_partial_proof_bound}
-\log_2 \cts(x_{1:n}) \leq \Gamma_D(\cS) + [d(\cS)+1]\log_2 n -\log_2 \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right).
\end{equation}
Finally, the proof is completed by noting that Equation \ref{eq:pst_parameter_redundancy} implies
\begin{equation*}
-\log_2 \left( \prod_{s\in\cS} \xi_{KT}(x^s_{1:n}) \right) \leq |\cS|\gamma(\tfrac{n}{|\cS|}) - \log_2 \Pr(x_{1:n} \,|\, \cS, \Theta_{\cS}),
\end{equation*}
and then combining the above with Equation \ref{eq:cts_partial_proof_bound}.
\end{proof}
\end{reptheorem}
\fi
\end{document}
*