%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Monotone Conditional Complexity Bounds %%
%% on Future Prediction Errors %%
%% Alexey Chernov & Marcus Hutter: Start 2005 %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newif\ifconf\conffalse % conference style versus no-style
%-------------------------------%
% Document-Style %
%-------------------------------%
% \ifconf switches between the LLNCS conference layout (true) and a
% plain 12pt two-sided article layout for the technical report (false).
\ifconf
\documentclass[oribibl,envcountsame]{llncs}
\usepackage{amsmath,amssymb}
\sloppy\lineskip=0pt
\else
\documentclass[12pt,twoside]{article}
\usepackage{amsmath,amssymb}
\usepackage{amsthm}
% Enlarged text area for the report version.
\topmargin=-10mm \oddsidemargin=5mm \evensidemargin=5mm
\textwidth=15cm \textheight=22.7cm \sloppy\lineskip=0pt
% NOTE(review): the next line duplicates \sloppy\lineskip=0pt already
% issued at the end of the previous line; harmless but redundant.
\sloppy\lineskip=0pt
\fi %ifconf
%-------------------------------%
% Spacings %
%-------------------------------%
% Freeze \, \; \! to their current meanings (\edef expands immediately),
% making them robust against later redefinitions.
\edef\,{\thinspace}
\edef\;{\thickspace}
\edef\!{\negthinspace}
% Two math-spacing profiles: \dispmuskip (wider, for displayed formulas)
% and \textmuskip (tighter, for inline math). \textmuskip is activated
% globally just below; the \beq.../\bqan... wrappers switch to
% \dispmuskip for the duration of each display and back afterwards.
\def\dispmuskip{\thinmuskip= 3mu plus 0mu minus 2mu \medmuskip= 4mu plus 2mu minus 2mu \thickmuskip=5mu plus 5mu minus 2mu}
\def\textmuskip{\thinmuskip= 0mu \medmuskip= 1mu plus 1mu minus 1mu \thickmuskip=2mu plus 3mu minus 1mu}
\textmuskip
% Display shorthands: numbered equation (\beq), unnumbered display
% (\beqn), numbered (\bqa) and unnumbered (\bqan) eqnarray.
\def\beq{\dispmuskip\begin{equation}} \def\eeq{\end{equation}\textmuskip}
\def\beqn{\dispmuskip\begin{displaymath}}\def\eeqn{\end{displaymath}\textmuskip}
\def\bqa{\dispmuskip\begin{eqnarray}} \def\eqa{\end{eqnarray}\textmuskip}
\def\bqan{\dispmuskip\begin{eqnarray*}} \def\eqan{\end{eqnarray*}\textmuskip}
% Bold run-in paragraph headings, with (\paradot) and without
% (\paranodot) a trailing period.
\def\paradot#1{\vspace{1.3ex plus 0.5ex minus 0.5ex}\noindent{\bf{#1.}}}
\def\paranodot#1{\vspace{1.3ex plus 0.5ex minus 0.5ex}\noindent{\bf{#1}}}
%-------------------------------%
% Environments %
%-------------------------------%
% Theorem-like environments: supplied by llncs in the conference
% version (only the unnumbered Note must be added); defined via amsthm
% in the report version, all sharing the "theorem" counter. \qedx is a
% manual end-of-proof marker for llncs; under amsthm the proof
% environment places \qedsymbol itself, so \qedx is empty there.
\ifconf
\renewcommand{\proofname}{\bfseries Proof}
\def\qedx{\hspace*{\fill}$\Box\quad$}
\spnewtheorem*{note*}{Note}{\itshape}{\rmfamily} %% unnumbered note for llncs
\else
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\theoremstyle{remark}
\newtheorem*{note*}{Note}
\renewcommand{\proofname}{\bfseries Proof}
\def\qedsymbol{$\Box\quad$}
\def\qedx{}
% Keywords block (report version only; the llncs branch has none).
\newenvironment{keywords}%
{\centerline{\bf\small Keywords}\begin{quote}\small}%
{\par\end{quote}\vskip 1ex}
\fi
%-------------------------------%
% Inequalities %
%-------------------------------%
\makeatletter
% Allocate the two scratch boxes only if nothing has done so already.
\@ifundefined{@tempboxa}{\@nameuse{newbox}\@tempboxa}{}
\@ifundefined{@tempboxb}{\@nameuse{newbox}\@tempboxb}{}
% \xxstackrel{#1}{#2}{#3}{#4}{#5}{#6}: relation symbol #3 decorated
% with #1 (in \scriptstyle) stacked above it via \ooalign;
%   #2 = raise of the decoration, #4 = lowering of the base symbol,
%   #5/#6 = explicit height/depth forced on the result so the decorated
%   relation does not disturb the surrounding line spacing.
\def\xxstackrel#1#2#3#4#5#6{\setbox\@tempboxa=\hbox{$#3$}
\mathrel{%
\setbox\@tempboxb=\hbox{%
\m@th \hbox {\ooalign {%
\raise #2\hbox to \wd\@tempboxa%
{\hfil$\scriptstyle #1$\hfil}\crcr%
\lower #4\box\@tempboxa}}}%
\ht\@tempboxb=#5
\dp\@tempboxb=#6
\box\@tempboxb\relax
}}
\makeatother
% Relations holding within an additive (+) or multiplicative (\times)
% constant, as customary in algorithmic information theory; \leqq is a
% questioned inequality, \leqbl an equality that may sharpen to <.
\def\equa{\xxstackrel{+}{1.2ex}{=}{0ex}{1.6ex}{0.45ex}}
\def\leqa{\xxstackrel{+}{1.4ex}{\leq}{0.1ex}{2ex}{0.45ex}}
\def\geqa{\xxstackrel{+}{1.4ex}{\geq}{0.1ex}{2ex}{0.45ex}}
\def\eqm{\xxstackrel{\times}{1.1ex}{=}{0ex}{1.95ex}{0ex}}
\def\leqm{\xxstackrel{\hspace{-0.2ex}\times}{1.4ex}{\leq}{0.1ex}{2ex}{0.45ex}}
\def\geqm{\xxstackrel{\hspace{0.1ex}\times}{1.4ex}{\geq}{0.1ex}{2ex}{0.45ex}}
\def\ngeqm{\xxstackrel{\hspace{0.1ex}\times}{1.4ex}{\not\geq}{0.1ex}{2ex}{0.45ex}}
\def\leqq{\xxstackrel{?}{1.4ex}{\leq}{0.1ex}{2ex}{0.45ex}}
\def\leqbl{\xxstackrel{\!(<)\,}{1.3ex}{=}{0.2ex}{2ex}{0.45ex}}
%-------------------------------%
% More Macro-Definitions %
%-------------------------------%
\def\eps{\varepsilon}
\def\nq{\hspace{-1em}} % negative quad (pull back)
\def\odt{{\textstyle{1\over 2}}} % one half
% Blackboard-style number sets (hand-rolled, no \mathbb needed).
\def\SetB{I\!\!B}
\def\SetR{I\!\!R}
\def\SetN{I\!\!N}
\def\D{{\cal D}}
\def\E{{\bf E}} % Expectation value
\def\M{{\cal M}} % Set of prob. distributions
\def\P{{\bf P}} % Probability
\def\X{{\cal X}} % input/perception set/alphabet
% Text interjected into math with surrounding \quad / \qquad space.
\def\qmbox#1{{\quad\mbox{#1}\quad}}
\def\qqmbox#1{{\qquad\mbox{#1}\qquad}}
\def\l{{\ell}} % length of string or program
\def\lb{{\log_2}} % binary logarithm
\def\pb#1{} % discards its argument (disabled markup hook)
\def\hh#1{{\dot{#1}}} % historic I/O
\def\eps{\varepsilon} % for small positive number (duplicate of definition above)
\def\epstr{\epsilon} % for empty string
\def\toinfty#1{\stackrel{\smash{#1}\to\infty}{\longrightarrow}} % limit arrow, e.g. n -> infinity
\def\a{\alpha}
\def\o{\omega}
\def\e{{\rm e}} % natural e
% Complexity notations: Solomonoff (KM), monotone (Km), prefix (K=\KP),
% plain (C), and the paper's new monotone conditional complexity K_*.
\def\KM{K\!M}
\def\Km{K\!m}
\def\KP{K}
\def\C{C}
\def\KPM{{K_*}}
\newcommand{\kpc}[2]{\KP(#1|#2)} % usual conditional compl.
\newcommand{\kpm}[2]{\KPM(#1|#2*)} % new conditional compl.
\newcommand{\bin}{\{0,1\}}
\newcommand{\fin}{\bin^\ast} % finite binary strings
\newcommand{\infin}{\bin^\infty} % infinite binary sequences
\newcommand{\emptyword}{\epstr} %{\Lambda}
\newcommand*{\pair}[1]{\langle #1\rangle} % standard pairing/encoding
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% T i t l e - P a g e %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ifconf
\title{Monotone Conditional Complexity Bounds \\ on Future Prediction Errors}
\author{Alexey Chernov and Marcus Hutter}
\authorrunning{Alexey Chernov and Marcus Hutter}
\institute{IDSIA, Galleria 2, CH-6928 Manno-Lugano, Switzerland
\footnote[0]{This work was supported by SNF grants
200020-100259 (to J\"urgen Schmidhuber),
2100-67712 and 200020-107616.}\\
\{alexey,marcus\}@idsia.ch, \ http://www.idsia.ch/$^{_{_\sim}}\!$\{alexey,marcus\}
}\maketitle
\else
\title{\normalsize\sc Technical Report \hfill IDSIA-16-05
\vskip 2mm\bf\Large\hrule height5pt \vskip 6mm
Monotone Conditional Complexity Bounds \\
on Future Prediction Errors
\vskip 6mm \hrule height2pt \vskip 5mm}
\author{{{\bf Alexey Chernov} and {\bf Marcus Hutter}}\\[3mm]
\normalsize IDSIA, Galleria 2, CH-6928\ Manno-Lugano, Switzerland%
\thanks{This work was supported by SNF grants
200020-100259 (to J\"urgen Schmidhuber),
2100-67712 and 200020-107616.}\\
\normalsize \{alexey,marcus\}@idsia.ch, \ http://www.idsia.ch/$^{_{_\sim}}\!$\{alexey,marcus\} }
\date{18 July 2005}
\maketitle
\fi
\begin{abstract}
We bound the future loss when predicting any (computably)
stochastic sequence online. Solomonoff finitely bounded the total
deviation of his universal predictor $M$ from the true
distribution $\mu$ by the algorithmic complexity of $\mu$. Here we
assume we are at a time $t>1$ and already observed $x=x_1...x_t$.
We bound the future prediction performance on $x_{t+1}x_{t+2}...$
by a new variant of algorithmic complexity of $\mu$ given $x$,
plus the complexity of the randomness deficiency of $x$. The new
complexity is monotone in its condition in the sense that this
complexity can only decrease if the condition is prolonged. We
also briefly discuss potential generalizations to Bayesian model
classes and to classification problems.
\end{abstract}
\ifconf\else
\begin{keywords}
Kolmogorov complexity,
posterior bounds,
online sequential prediction,
Solomonoff prior,
monotone conditional complexity,
total error,
future loss,
randomness deficiency.
\end{keywords}
\fi
\ifconf\else\newpage\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}\label{secInt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%-------------------------------%
%\paradot{Historical Survey}
%-------------------------------%
We consider the problem of online=sequential predictions. We
assume that the sequences $x=x_1x_2x_3...$ are drawn from some ``true''
but unknown probability distribution $\mu$. Bayesians proceed by
considering a class $\M$ of models=hypotheses=distributions,
sufficiently large such that $\mu\in\M$, and a prior over $\M$.
%
Solomonoff considered the truly large class that contains all
computable probability distributions \cite{Solomonoff:64}.
He showed that his universal distribution $M$ converges rapidly to
$\mu$ \cite{Solomonoff:78}, i.e.\ predicts well in any environment
as long as it is computable or can be modeled by a computable
probability distribution (all physical theories are of this sort).
%
$M(x)$ is roughly $2^{-\KP(x)}$, where $\KP(x)$ is the length of
the shortest description of $x$, called Kolmogorov complexity of
$x$. Since $\KP$ and $M$ are incomputable, they have to be
approximated in practice. See e.g.\
\cite{Schmidhuber:02speed,Hutter:04uaibook,Li:97,Cilibrasi:05} and
references therein.
%
The universality of $M$ also precludes useful statements of the
prediction quality at particular time instances $n$
\cite[p62]{Hutter:04uaibook}, as opposed to simple classes like
i.i.d.\ sequences (data) of size $n$, where accuracy is typically
$O(n^{-1/2})$.
%
Luckily, bounds on the expected {\em total}=cumulative loss
(e.g.\ number of prediction errors) for $M$ can be derived
\cite{Solomonoff:78,Hutter:02spupper,Hutter:03optisp},
which is often sufficient in an online setting. The bounds are in
terms of the (Kolmogorov) complexity of $\mu$. For instance, for
deterministic $\mu$, the number of errors is (in a sense tightly)
bounded by $\KP(\mu)$ which measures in this case the information
(in bits) in the observed infinite sequence $x$.
%-------------------------------%
\paradot{What's new}
%-------------------------------%
In this paper we assume we are at a time $t>1$ and already
observed $x=x_1...x_t$. Hence we are interested in the future
prediction performance on $x_{t+1}x_{t+2}...$, since typically we
don't care about past errors.
%
If the total loss is finite, the future loss must necessarily be
small for large $t$. In a sense the paper intends to quantify this
apparent triviality.
%
If the complexity of $\mu$ bounds the total loss, a natural guess
is that something like the conditional complexity of $\mu$ given
$x$ bounds the future loss. (If $x$ contains a lot of (or even all)
information about $\mu$, we should make few (or even no) further errors.)
%
Indeed, we prove two bounds of this kind but with additional
terms describing structural properties of $x$.
These additional terms appear since the total loss is bounded
only in expectation,
and hence the future loss is small only for ``most'' $x_1...x_t$.
In the first bound (Theorem~\ref{thm:lbnd}), the additional term is the
complexity of the length of $x$ (a kind of worst-case estimation).
The second bound (Theorem~\ref{thm:KPMbound}) is finer:
the additional term is the complexity of the randomness deficiency of $x$.
The advantage is that the deficiency is small for ``typical'' $x$
and bounded on average (in contrast to the length).
But in this case the
conventional conditional complexity turned out to be unsuitable.
%
So we introduce a new natural modification of conditional
Kolmogorov complexity, which is monotone as a function of
condition. Informally speaking, we require programs
(=descriptions) to be consistent in the sense that if a program
generates some $\mu$ given $x$, then it must generate the same
$\mu$ given any prolongation of $x$.
%
The new posterior bounds also significantly improve the previous
total bounds.
%-------------------------------%
\paradot{Contents}
%-------------------------------%
The paper is organized as follows. Some basic notation and
definitions are given in Sections~\ref{secNot} and~\ref{secSetup}.
In Section~\ref{secPostBnd} we prove and discuss the length-based
bound Theorem~\ref{thm:lbnd}. In Section~\ref{secKpmBnd} we show
why a new definition of complexity is necessary and formulate the
deficiency-based bound Theorem~\ref{thm:KPMbound}. We discuss the
definition and basic properties of the new complexity in
Section~\ref{secKpmDisc}, and prove Theorem~\ref{thm:KPMbound} in
Section~\ref{secKpmProof}. We briefly discuss potential
generalizations to general model classes $\M$ and classification
in the concluding Section~\ref{secDisc}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation \& Definitions}\label{secNot}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We essentially follow the notation of \cite{Li:97,Hutter:04uaibook}.
%------------------------------%
\paradot{Strings and natural numbers}
%------------------------------%
We write $\X^*$ for the set of finite strings over a finite
alphabet $\X$, and $\X^\infty$ for the set of infinite sequences.
The cardinality of a set $\cal S$ is denoted by $|{\cal S}|$. We
use letters $i,k,l,n,t$ for natural numbers, $u,v,x,y,z$ for
finite strings, $\epstr$ for the empty string, and
$\a=\a_{1:\infty}$ etc.\ for infinite sequences. For a string $x$
of length $\l(x)=n$ we write $x_1x_2...x_n$ with $x_t\in\X$ and
further abbreviate $x_{k:n}:=x_kx_{k+1}...x_{n-1}x_n$ and
$x_{<n}:=x_{1:n-1}$.
% NOTE(review): a span of the original text was lost in extraction here
% (introduction of (semi)measures and enumerability); minimally
% reconstructed — restore from the original source.
A semimeasure $\mu$ is called \emph{enumerable} if the set
$\{(x,k,n)\mid\mu(x)>\frac{k}{n}\}$ is enumerable.
A measure $\mu$ is called \emph{computable} if
it is enumerable and co-enumerable
and the set $\{x\mid\mu(x)=0\}$ is decidable
(i.\,e.\ enumerable and co-enumerable).
%------------------------------%
\paradot{Prefix Kolmogorov complexity}
%------------------------------%
The conditional prefix complexity
$\kpc{y}{x}:=\min\{\l(p):U(p,x)=y\}$
is the length of the shortest binary
(self-delimiting) program $p\in\fin$ on a universal prefix Turing
machine $U$ with output $y\in\X^*$ and input $x\in\X^*$
\cite{Li:97}. $\KP(x):=\kpc{x}{\epstr}$.
%
For non-string objects $o$ we define $\KP(o):=\KP(\langle
o\rangle)$, where $\langle o\rangle\in\X^*$ is some standard code
for $o$. In particular, if $(f_i)_{i=1}^\infty$ is an enumeration
of all (co-)enumerable functions, we define
$\KP(f_i):=\KP(i)$.
%
We need the following properties: %
The co-enumerability of $\KP$, %
the upper bounds $\kpc{x}{\l(x)}\leqa\l(x)\lb|\X|$ and $\KP(n)\leqa 2\lb n$, %
Kraft's inequality $\sum_x 2^{-\KP(x)}\leq 1$, %
the lower bound $\KP(x)\geq l(x)$ for ``most'' $x$ %
(which implies $\KP(n)\toinfty{n}\infty$), %
extra information bounds $\kpc{x}{y}\leqa \KP(x)\leqa \KP(x,y)$, %
subadditivity $\KP(xy)\leqa \KP(x,y)\leqa \KP(y)+\kpc{x}{y}$, %
information non-increase $\KP(f(x))\leqa \KP(x)+\KP(f)$
for computable $f:\X^*\to\X^*$, %
and coding relative to a probability distribution (MDL):
if $P:\X^*\to[0,1]$ is enumerable and $\sum_x P(x)\leq 1$,
then
$\KP(x)\leqa -\lb P(x)+\KP(P)$.
%------------------------------%
\paradot{Monotone and Solomonoff complexity}
%------------------------------%
The monotone complexity $\Km(x):=\min\{\l(p):U(p)=x*\}$ is the
length of the shortest binary (possibly non-halting) program
$p\in\fin$ on a universal monotone Turing machine $U$ which
outputs a string starting with $x$. Solomonoff's prior
$M(x):=\sum_{p:U(p)=x*}2^{-\l(p)}=:2^{-\KM(x)}$ is the probability
that $U$ outputs a string starting with $x$ if provided with fair
coin flips on the input tape. Most complexities coincide within an
additive term $O(\log\l(x))$, e.g.\
$\kpc{x}{\l(x)}\leqa\KM(x)\leq\Km(x)\leq \KP(x)$, hence similar
relations as for $\KP$ hold.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Setup}\label{secSetup}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%------------------------------%
\paradot{Convergent predictors}
%------------------------------%
We assume that $\mu$ is a ``true''\footnote{Also called {\em
objective} or {\em aleatory} probability or {\em chance}.}
sequence generating measure, also called environment. If we know
the generating process $\mu$, and given past data $x_{<t}$, then the
posterior $\mu(x_t|x_{<t})$ is the best possible prediction of the
next symbol $x_t$.
% NOTE(review): a span of the original text was lost in extraction here
% (introducing the predictor $\rho$ and the formal definitions of the
% deviations $D_t$ and $D_{t:\infty}$); restore from the original source.
We consider predictions at an intermediate time $t>1$. For illustration we
will sometimes loosely interpret $D_{1:\infty}$ and other
quantities as the number of prediction errors, as for the
error-loss they are closely related to it \cite{Hutter:01alpha}.
%------------------------------%
\paradot{Bayes mixtures}
%------------------------------%
A Bayesian considers a class of distributions $\M :=
\{\nu_1,\nu_2,...\}$, large enough to contain $\mu$, and uses
the Bayes mixture
\beq\label{xidefsp}
\xi(x) \;:=\;
\sum_{\nu\in\M}w_\nu\!\cdot\!\nu(x),\quad
\sum_{\nu\in\M}w_\nu=1,\quad w_\nu>0.
\eeq
for prediction, where $w_\nu$ can be interpreted as the prior of
(or initial belief in) $\nu$. The dominance
\beq\label{unixi}
\xi(x) \;\geq\;
w_\mu\!\cdot\!\mu(x) \quad \forall x\in\X^*
\eeq
is its most important property. Using $\rho=\xi$ for prediction,
this implies $D_{1:\infty}\leq\ln w_\mu^{-1}<\infty$, hence
$\xi_t\to\mu_t$. If $\M$ is chosen sufficiently large, then
$\mu\in\M$ is not a serious constraint.
%------------------------------%
\paradot{Solomonoff prior}
%------------------------------%
So we consider the largest (from a computational point of view)
relevant class, the class $\M_U$ of all enumerable semimeasures
(which includes all computable probability distributions) and
choose $w_\nu=2^{-\KP(\nu)}$ which is biased towards simple
environments (Occam's razor). This gives us Solomonoff-Levin's
prior $M$ \cite{Solomonoff:64,Zvonkin:70} (this definition
coincides within an irrelevant multiplicative constant with the
one in Section \ref{secNot}). In the following we assume
$\M=\M_U$, $\rho=\xi=M$, $w_\nu=2^{-\KP(\nu)}$ and $\mu\in\M_U$
being a computable (proper) measure, hence $M(x)\geq
2^{-K(\mu)}\mu(x)\,\forall x$ by (\ref{unixi}).
%------------------------------%
\paradot{Prediction of deterministic environments}
%------------------------------%
Consider a computable sequence $\a=\a_{1:\infty}$ ``sampled from
$\mu\in\M$'' with $\mu(\a)=1$, i.e.\ $\mu$ is deterministic,
then from (\ref{unixi}) we get
\beq\label{detBnd}
\sum_{t=1}^\infty |1-M(\a_t|\a_{<t})| \;<\; \infty\,.
\eeq
% NOTE(review): text lost in extraction around this display (including
% its right-hand side bound and the closing \eeq); minimally
% reconstructed — restore from the original source.
This holds iff $\sum_t\log M(\a_t|\a_{<t})$ converges
($>-\infty$ for all sequences), i.e.\ iff
\beqn
\sup_n\Big|\sum_{t=1}^n\log{\mu(\a_t|\a_{<t})}\Big| \;<\; \infty\,.
\eeqn
% NOTE(review): further text lost in extraction here.
Choose a position $l$ with $\mu(0|\a_{<l})>c$ or $\mu(1|\a_{<l})>c$ for
$c:=[3\ln 2]^{-1}<\odt$. Since $\mu$ is computable, we can find
(effectively) $b\in\bin$ such that $\mu(b|\a_{<l})>c$. Put
$\a_l=\bar b$.
Let us estimate $M(\bar \a_l|\a_{<l})$.
% NOTE(review): text lost in extraction; the set referred to in the
% next sentence was defined here — minimally reconstructed, restore
% from the original source.
Consider the set $\{x\mid M(x)> 0\}$.
This set is prefix free and decidable. Therefore
$P(l)=M(\a_{<l}\bar\a_l)$ is enumerable.
% NOTE(review): text lost in extraction here; minimally reconstructed.
Since $\mu(b|\a_{<l})>c$, we get
\bqan
\sum_{b\in\bin}\!\!
\mu(b|\a_{<l})\;(\cdots)
\eqan
% NOTE(review): a large span of the original was lost in extraction
% here (the remainder of this display and argument, the end of this
% section, and the lead-in to the proof of Theorem~\ref{thm:KPMbound});
% restore from the original source.
Consider the set of strings of large randomness deficiency,
$S:=\{z\mid d_\mu(z)>d\}$,
and put $\nu$ to be $2^d\mu$ on prolongations of $z\in S$;
this is possible since $S$ has $\mu$-measure $2^{-d}$.
Then we have $\nu(z)\le C\cdot M(z)$ by universality of $M$.
However, the constant $C$ depends on $\mu$ and also on $d$.
To make the dependence explicit,
we repeat the above construction for all numbers $d$
and all semimeasures $\mu^T$,
obtaining semimeasures $\nu_{d,T}$,
and take $\nu=\sum 2^{-\KP(d)}\cdot 2^{-\KP(T)}\nu_{d,T}$.
This construction would give us the term $\KP(\mu)$
in the right-hand side of Theorem \ref{thm:KPMbound}.
To get $\kpm{\mu}{x}$,
we need a more complicated strategy:
instead of a sum of semimeasures $\nu_{d,T}$,
for every fixed $d$
we sum ``pieces'' of $\nu_{d,T}$ at each point $z$,
with coefficients depending on $z$ and $T$.
Now proceed with the formal proof.
Let $\{\mu^T\}_{T\in\SetN}$ be any
(effective) enumeration of all enumerable
semimeasures.
%
For any integer $d$ and any $T$, put
\beqn
S_{d,T} \;:=\; \{z\mid \sum_{v\in\X^{\l(z)}\setminus\{z\}}
\mu^T(v) + 2^{-d}M(z) > 1\}\,.
\eeqn
The set $S_{d,T}$ is enumerable given $d$ and $T$.
Let $E$ be the optimal $\KPM$-correct set
(satisfying all three requirements),
$E(p,z)$ is the corresponding partial computable function.
%
For any $z\in\X^*$ and $T$, put
\beqn
\lambda(z,T) \;:=\;
\max\{2^{-\l(p)}\mid\exists k\le\l(z)\colon
z_{1:k}\in S_{d,T}\text{ and } E(p,z_{1:k})=T\}
\eeqn
(if there is no such $p$, then $\lambda(z,T)=0$).
%
Put
\beqn
\tilde\nu_d(z) \;:=\; \sum_{T}
\lambda(z,T)\cdot 2^{d}\mu^{T}(z)\,.
\eeqn
Obviously, this value is enumerable. It is not a semimeasure, but
it has the following property (we omit the proof).
\begin{claim}\label{claim:prefix}
For any prefix-free set $A$,
\beqn
\sum_{z\in A}\tilde\nu_d(z) \;\le\; 1\,.
\eeqn
\end{claim}
This implies that there exists an enumerable semimeasure $\nu_d$
such that $\nu_d(z)\ge\tilde\nu_d(z)$ for all $z$.
Actually, to enumerate $\nu_d$,
one enumerates $\tilde\nu_d(z)$ for all $z$
and at each step sets the current value of $\nu_d(z)$
to the maximum of the current values of $\tilde\nu_d(z)$
and $\sum_{u\in\X}\nu_d(zu)$.
Trivially, this provides $\nu_d(z)\ge\sum_{u\in\X}\nu_d(zu)$.
To show that $\nu_d(\emptyword)\le 1$,
let us note that at any step of enumeration the current value of
$\nu_d(\emptyword)$
is the sum of current values $\tilde\nu_d(z)$ over some
prefix-free set, and thus is bounded by $1$.
%
Put
\beqn
\nu(z) \;:=\; \sum_{d} 2^{-\KP(d)}\nu_{d}(z)\,.
\eeqn
Clearly, $\nu$ is an enumerable semimeasure,
thus $\nu(z)\leqm M(z)$.
%
Let $\mu$ be an arbitrary computable measure,
and
$x,y\in\X^*$.
Let $p\in\fin$ be a string such that $\kpm{\mu}{x}=\l(p)$,
$E(p,x)=T$, and $\mu=\mu^T$.
%
Put $d=\lceil d_\mu(x)\rceil-1$, i.e.,
$d_\mu(x)-1\le d < d_\mu(x)$. Hence $\mu(x) < 2^{-d}M(x)$.
Since $\mu=\mu^{T}$ is a measure,
we have
$\sum_{v\in\X^{\l(x)}} \mu^T(v)=1$,
and therefore $x\in S_{d,T}$.
By definition, $\lambda(xy,T)\ge 2^{-\l(p)}$,
thus $\tilde\nu_{d}(xy)\ge 2^{-\l(p)}2^d\mu(xy)$,
and
\beqn
2^{-\KP(d)} 2^{-\l(p)} 2^d\mu(xy)
\;\le\; \nu(xy) \;\leqm\; M(xy)\,.
\eeqn
After trivial transformations we get
\beqn
\lb\frac{\mu(y|x)}{M(y|x)} \;\leqa \kpm{\mu}{x}+\KP(d)\,,
\eeqn
which completes the proof of Theorem~\ref{thm:KPMbound}.
\ifconf\newpage\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussion}\label{secDisc}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%-------------------------------%
\paradot{Conclusion}
%-------------------------------%
We evaluated the quality of predicting a stochastic sequence at an
intermediate time, when some beginning of the sequence has been
already observed, estimating the future loss of the universal
Solomonoff predictor $M$. We proved general upper bounds for the
discrepancy between conditional values of the predictor $M$ and
the true environment $\mu$, and demonstrated a kind of tightness
for these bounds. One of the bounds is based on a new variant of
conditional algorithmic complexity $\KPM$, which has interesting
properties in its own. In contrast to standard prefix complexity
$\KP$, $\KPM$ is a monotone function of conditions:
$\kpm{y}{xz}\leq\kpm{y}{x}$.
%-------------------------------%
\paradot{General Bayesian posterior bounds}
%-------------------------------%
A natural question is whether posterior bounds for general Bayes
mixtures based on general $\M\ni\mu$ could also be derived. From
the (obvious) posterior representation $\xi(y|x) = \sum_{\nu\in\M}
w_\nu(x)\nu(y|x) \geq w_\mu(x)\mu(y|x)$, where $w_\nu(x) :=
w_\nu{\nu(x)\over\xi(x)}$ is the posterior belief in $\nu$ after
observing $x$, the bound $D_{l:\infty}\leq\ln w_\mu(\o_{