%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Universal Convergence of Semimeasures %%
%% on Individual Random Sequences %%
%% Marcus Hutter: Start: 18.11.03 %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%-------------------------------%
% Document-Style %
%-------------------------------%
% Article class, 12pt, two-sided layout; latexsym supplies a few extra symbols.
\documentclass[12pt,twoside]{article}
\usepackage{latexsym}
% Running heads: author names vs.\ short paper title.
\pagestyle{myheadings}
\markboth{\sc Marcus Hutter \& Andrej Muchnik
}{\sc On Semimeasures Predicting Martin-L\"of Random Sequences}
\setcounter{tocdepth}{4} \setcounter{secnumdepth}{2}
% Manual page geometry (set directly rather than via the geometry package).
\topmargin=0mm \oddsidemargin=5mm \evensidemargin=5mm
\textwidth=15cm \textheight=22cm
% Document-wide sloppy line breaking; zero extra interline glue.
\sloppy\lineskip=0pt
%-------------------------------%
% My Math-Spacings %
%-------------------------------%
% Toggle: \iftrue selects the authors' tightened math spacing;
% change to \iffalse to restore LaTeX's default spacing behavior.
\iftrue
% Tighter replacements for the standard math spacing macros \, \> \; \!.
\def\,{\mskip 3mu} \def\>{\mskip 4mu plus 2mu minus 4mu} \def\;{\mskip 5mu plus 5mu} \def\!{\mskip-3mu}
% \dispmuskip/\textmuskip set the thin/med/thick math glue for display
% vs.\ inline math; the \beq...\eeq wrappers below switch them automatically.
\def\dispmuskip{\thinmuskip= 3mu plus 0mu minus 2mu \medmuskip= 4mu plus 2mu minus 2mu \thickmuskip=5mu plus 5mu minus 2mu}
\def\textmuskip{\thinmuskip= 0mu \medmuskip= 1mu plus 1mu minus 1mu \thickmuskip=2mu plus 3mu minus 1mu}
\textmuskip
\def\eqsp{\vspace{0ex}}
% Shorthands for numbered/unnumbered equations and eqnarrays, applying
% the display spacing on entry and restoring text spacing on exit.
\def\beq{\dispmuskip\eqsp\begin{equation}} \def\eeq{\eqsp\end{equation}\textmuskip}
\def\beqn{\dispmuskip\eqsp\begin{displaymath}}\def\eeqn{\eqsp\end{displaymath}\textmuskip}
\def\bqa{\dispmuskip\eqsp\begin{eqnarray}} \def\eqa{\eqsp\end{eqnarray}\textmuskip}
\def\bqan{\dispmuskip\eqsp\begin{eqnarray*}} \def\eqan{\eqsp\end{eqnarray*}\textmuskip}
\else
% Plain fallbacks: same wrappers without any spacing adjustments.
\def\beq{\begin{equation}} \def\eeq{\end{equation}}
\def\beqn{\begin{displaymath}}\def\eeqn{\end{displaymath}}
\def\bqa{\begin{eqnarray}} \def\eqa{\end{eqnarray}}
\def\bqan{\begin{eqnarray*}} \def\eqan{\end{eqnarray*}}
\fi
%-------------------------------%
% Macro-Definitions %
%-------------------------------%
% Theorem-like environments; all share the `theorem' counter except
% figurex (see note below).
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{tablex}[theorem]{Table}
% NOTE(review): figurex is numbered with the *equation* counter, unlike
% tablex which shares the theorem counter -- confirm this is intended.
\newtheorem{figurex}[equation]{Figure}
% Centered ``Keywords'' heading followed by a small-font quote block.
\newenvironment{keywords}{\centerline{\bf\small
Keywords}\vspace{-1ex}\begin{quote}\small}{\par\end{quote}\vskip
1ex}
% \fXXX{label}{title}{body}: one-shot helpers producing environment XXX
% with bracketed title #2, \label{#1}, and body #3.
\def\ftheorem#1#2#3{\begin{theorem}[#2]\label{#1} #3 \end{theorem} }
\def\fcorollary#1#2#3{\begin{corollary}[#2]\label{#1} #3 \end{corollary} }
\def\flemma#1#2#3{\begin{lemma}[#2]\label{#1} #3 \end{lemma} }
\def\fdefinition#1#2#3{\begin{definition}[#2]\label{#1} #3 \end{definition} }
\def\fproposition#1#2#3{\begin{proposition}[#2]\label{#1} #3 \end{proposition} }
\def\ftablex#1#2#3{\begin{tablex}[#2]\label{#1} #3 \end{tablex} }
% \ffigurex{label}{title}{body}{content}: typesets #4, then the figurex
% environment with title #2, \label{#1}, and caption-like body #3.
\def\ffigurex#1#2#3#4{{#4}\begin{figurex}[#2]\label{#1}#3\end{figurex}}
% \myparskip: small elastic vertical gap starting an unindented paragraph.
\def\myparskip{\vspace{1.5ex plus 0.5ex minus 0.5ex}\noindent}
% \paradot{T}/\paranodot{T}: bold run-in paragraph heading, with/without
% a trailing period.
\def\paradot#1{\myparskip{\bfseries\boldmath{#1.}}}
\def\paranodot#1{\myparskip{\bfseries\boldmath{#1}}}
\def\toinfty#1{\stackrel{#1\to\infty}{\longrightarrow}} % arrow with #1->infinity above
\def\nq{\hspace{-1em}} % negative quad: pull neighboring material together
\def\qed{\hspace*{\fill}$\Box\quad$\vspace{1ex plus 0.5ex minus 0.5ex}} % end-of-proof box
\def\odt{{\textstyle{1\over 2}}} % one half (textstyle fraction)
\def\odf{{\textstyle{1\over 4}}} % one quarter
\def\eps{\varepsilon} % for small positive number
\def\epstr{\epsilon} % for empty string
\def\qmbox#1{{\quad\mbox{#1}\quad}} % text with quads around it, inside displays
\def\argmax{\mathop{\rm arg\,max}} % maxarg
\def\argmin{\mathop{\rm arg\,min}} % minarg
% Relations decorated with + or x: (in)equality up to an additive constant
% (\equa,\leqa,\geqa) resp.\ a multiplicative constant (\eqm,\leqm,\geqm).
\def\equa{\smash{\stackrel{\raisebox{0.8ex}{$\scriptstyle+$}}{\smash=}}}
\def\leqa{\smash{\stackrel{\raisebox{1ex}{$\scriptstyle\!\!\;+$}}{\smash\leq}}}
\def\geqa{\smash{\stackrel{\raisebox{1ex}{$\scriptstyle+\!\!\;$}}{\smash\geq}}}
\def\eqm{\smash{\stackrel{\raisebox{0.6ex}{$\scriptstyle\times$}}{\smash=}}}
\def\leqm{\smash{\stackrel{\raisebox{1ex}{$\scriptstyle\!\!\;\times$}}{\smash\leq}}}
\def\geqm{\smash{\stackrel{\raisebox{1ex}{$\scriptstyle\times\!\!\;$}}{\smash\geq}}}
\def\odn{{\textstyle{1\over n}}} % one over n
% Empty semantic marker for vectors (used as \v p); overrides LaTeX's caron
% accent \v -- assumes the accent is never needed in this document.
\def\v{}
\def\l{{\ell}} % length of string or program
\def\M{{\cal M}} % Set of prob. distributions
\def\X{{\cal X}} % input/perception set/alphabet
\def\E{{\bf E}} % Expectation value
\def\P{{\bf P}} % Probability (overrides LaTeX's pilcrow \P)
\def\B{\{0,1\}} % Binary set (or \SetB)
\def\Km{K\!m} % monotone Kolmogorov complexity
\def\MM{M} % Solomonoff's prior
\def\e{{\rm e}} % natural e
\def\a{\alpha} % NOTE: overrides LaTeX's tabular accent command \a
\def\o{\omega} % NOTE: overrides LaTeX's \o (the letter o-slash)
% Poor man's blackboard-bold number sets (amssymb is not loaded).
\def\SetN{I\!\!N} \def\SetQ{I\!\!\!Q} \def\SetR{I\!\!R} \def\SetZ{Z\!\!\!Z}
\def\lb{\log} % logarithm (presumably base 2 -- typeset simply as \log)
% Scriptsize text in math. NOTE(review): would clash with amsmath's \text
% if amsmath were ever loaded (it is not here).
\def\text#1{\mbox{\scriptsize{#1}}}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% T i t l e - P a g e %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{\vskip -25mm %\normalsize\sc Technical Report \hfill IDSIA-??-05
\vskip 2mm\bf\LARGE\hrule height5pt \vskip 3mm
\sc On Semimeasures Predicting Martin-L\"of Random Sequences%
\thanks{This work was partially supported by the Swiss National Science Foundation
(SNF grants 2100-67712 and 200020-107616) and the Russian Foundation
for Basic Research (RFBR grants N04-01-00427 and N02-01-22001).
Submitted 15.May'05. Revised 26.Sep'06.
A shorter version appeared in the proceedings
of the ALT 2004 conference \cite{Hutter:04mlconvx}.}
\vskip 4mm \hrule height2pt \vskip 0mm}
\author{
{\bf Marcus Hutter}\\[3mm]
\normalsize IDSIA, Galleria 2, CH-6928\ Manno-Lugano, Switzerland\\
\normalsize RSISE/ANU/NICTA, \ Canberra, ACT, 0200, \ Australia\\
\normalsize marcus@hutter1.net \hspace{11ex} http://www.hutter1.net\\[5mm]
{\bf Andrej Muchnik}\\[3mm]
\normalsize Institute of New Technologies, 10 Nizhnyaya Radischewskaya\\
\normalsize Moscow 109004, Russia \hfill
\normalsize muchnik@lpcs.math.msu.su %\\[5mm]
}
\date{14 October 2006}
\maketitle
\vspace{-5mm}
\begin{abstract}
\noindent
Solomonoff's central result on induction is that the prediction of
a universal semimeasure $\MM$ converges rapidly and with
probability 1 to the true sequence generating predictor $\mu$, if
the latter is computable. Hence, $M$ is eligible as a universal
sequence predictor in case of unknown $\mu$.
%
Despite some nearby results and proofs in the literature, the
stronger result of convergence for all (Martin-L{\"o}f) random
sequences remained open. Such a convergence result would be
particularly interesting and natural, since randomness can be
defined in terms of $\MM$ itself.
%
We show that there are universal semimeasures $M$ which do not
converge to $\mu$ on all $\mu$-random sequences, i.e.\ we give a
partial negative answer to the open problem.
%
We also provide a positive answer for some non-universal
semimeasures. We define the incomputable measure $D$ as a
mixture over all computable measures and the enumerable
semimeasure $W$ as a mixture over all enumerable nearly-measures.
We show that $W$ converges to $D$ and $D$ to $\mu$ on all random
sequences.
%
The Hellinger distance measuring closeness of two distributions
plays a central role.
\end{abstract}
\begin{keywords}
Sequence prediction;
Algorithmic Information Theory;
universal enumerable semimeasure;
mixture distributions;
predictive convergence;
Martin-L{\"o}f randomness;
supermartingales;
quasimeasures.
\end{keywords}
\pagebreak
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}\label{secIntro}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{quote}\it
``All difficult conjectures should be proved by reductio ad absurdum
arguments. For if the proof is long and complicated enough you are
bound to make a mistake somewhere and hence a contradiction will
inevitably appear, and so the truth of the original conjecture is
established QED.'' \par
\hfill --- {\sl Barrow's second `law' (2004)}
\end{quote}
%------------------------------%
%\paradot{Universal induction}
%------------------------------%
A sequence prediction task is defined as to predict the next
symbol $x_n$ from an observed sequence $x=x_1...x_{n-1}$.
%
The key concept to attack general prediction problems is
Occam's razor, and to a less extent Epicurus' principle of
multiple explanations. The former/latter may be interpreted as to
keep the simplest/all theories consistent with the observations
$x_1...x_{n-1}$ and to use these theories to predict $x_n$.
%
Solomonoff \cite{Solomonoff:64,Solomonoff:78} formalized and
combined both principles in his universal a priori semimeasure $M$
which assigns high/low probability to simple/complex environments
$x$, hence implementing Occam and Epicurus. Formally it can be
represented as a mixture of all enumerable semimeasures.
%
An abstract characterization of $M$ by Levin \cite{Zvonkin:70} is
that $M$ is a universal enumerable semimeasure in the sense that
it multiplicatively dominates all enumerable semimeasures.
%------------------------------%
%\paradot{Predictive convergence}
%------------------------------%
Solomonoff's \cite{Solomonoff:78} central result is that if the
probability $\mu(x_n|x_1...x_{n-1})$ of observing $x_n$ at time
$n$, given past observations $x_1...x_{n-1}$ is a computable
function, then the universal predictor $M_n:=M(x_n|x_1...x_{n-1})$
converges (rapidly!) {\em with $\mu$-probability 1} (w.p.1) for
$n\to\infty$ to the optimal/true/informed predictor
$\mu_n:=\mu(x_n|x_1...x_{n-1})$, hence $M$ represents a universal
predictor in case of unknown ``true'' distribution $\mu$.
%
Convergence of $M_n$ to $\mu_n$ w.p.1 tells us that $M_n$ is close
to $\mu_n$ for sufficiently large $n$ for almost all sequences
$x_1x_2...$. It says nothing about whether convergence is true for
any {\em particular} sequence (of measure 0).
%------------------------------%
%\paranodot{Martin-L{\"o}f randomness and convergence}
%------------------------------%
Martin-L{\"o}f (M.L.) randomness is the standard notion for
randomness of individual sequences \cite{MartinLoef:66,Li:97}. A
M.L.-random sequence passes {\em all} thinkable effective randomness
tests, e.g.\ the law of large numbers, the law of the iterated
logarithm, etc. In particular, the set of all $\mu$-random sequences
has $\mu$-measure 1.
%
It is natural to ask whether $M_n$ converges to $\mu_n$ (in
difference or ratio) individually for all M.L.-random sequences.
Clearly, Solomonoff's result shows that convergence may at most
fail for a set of sequences with $\mu$-measure zero. A convergence
result for M.L.-random sequences would be particularly interesting and
natural in this context, since M.L.-randomness can be defined in
terms of $M$ itself \cite{Levin:73random}.
%
Despite several attempts to solve this problem
\cite{Vovk:87,Vitanyi:00,Hutter:03unipriors}, it remained open
\cite{Hutter:03mlconv}.
%------------------------------%
%\paranodot{What's new?}
%------------------------------%
In this paper we construct an M.L.-random sequence and show the
existence of a universal semimeasure which does not converge on
this sequence, hence answering the open question negatively for
some $M$.
%
It remains open whether there exist (other) universal
semimeasures, probably with particularly interesting additional
structure and properties, for which M.L.-convergence holds.
%
The main positive contribution of this work is the construction of
a non-universal enumerable semimeasure $W$ which M.L.-converges to
$\mu$ as desired. As an intermediate step we consider the
incomputable measure $\hat D$, defined as a mixture over all
computable measures. We show M.L.-convergence of predictor $W$ to
$\hat D$ and of $\hat D$ to $\mu$.
%
The Hellinger distance measuring closeness of two predictive
distributions plays a central role in this work.
%------------------------------%
%\paradot{Contents}
%------------------------------%
The paper is organized as follows:
In Section~\ref{secUniM} we give basic notation and results (for
strings, numbers, sets, functions, asymptotics, computability
concepts, prefix Kolmogorov complexity), and define and discuss
the concepts of (universal) (enumerable) (semi)measures.
%
Section~\ref{secConv} summarizes Solomonoff's and G\'acs' results
on predictive convergence of $M$ to $\mu$ with probability 1. Both
results can be derived from a bound on the expected Hellinger sum.
We present an improved bound on the expected exponentiated
Hellinger sum, which implies very strong assertions on the
convergence rate.
%
In Section~\ref{secMLNconv} we investigate whether convergence for
all Martin-L{\"o}f random sequences holds.
We construct a $\mu$-M.L.-random sequence on which some universal
semimeasures $M$ do not converge to $\mu$. We give a non-constructive
and a constructive proof of different virtue.
%
In Section~\ref{secMLconv} we present our main positive result. We
derive a finite bound on the Hellinger sum between $\mu$ and $\hat
D$, which is exponential in the randomness deficiency of the
sequence and double exponential in the complexity of $\mu$. This
implies that the predictor $\hat D$ M.L.-converges to $\mu$.
%
Finally, in Section~\ref{secNUESW} we show that $W$ is non-universal
and asymptotically M.L.-converges to $\hat D$, and summarize the
computability, measure, and dominance properties of $M$, $D$, $\hat
D$, and $W$.
%
Section~\ref{secDisc} contains discussion and outlook.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation \& Universal Semimeasures $\MM$}\label{secUniM}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%------------------------------%
\noindent {\bf Strings.}
%------------------------------%
Let $i,k,n,t\in\SetN=\{1,2,3,...\}$ be natural numbers,
$x,y,z\in\X^*=\bigcup_{n=0}^\infty\X^n$ be finite strings of
symbols over finite alphabet $\X\ni a,b$.
We write $xy$ for the concatenation of string $x$ with $y$.
We denote strings $x$ of length $\l(x)=n$ by
$x=x_1x_2...x_n\in\X^n$ with $x_t\in\X$ and further abbreviate
$x_{k:n}:=x_k x_{k+1}...x_{n-1}x_n$ for $k\leq n$, and $x_{0 : \MM(x)\geq w_\nu\!\cdot\!\nu(x)\;\forall x\in\X^*$.}
\eeqn
}%------------------------------%
%
From now on we consider the (in a sense) largest class $\M$ which
is relevant from a constructive point of view (but see
\cite{Schmidhuber:00toe,Schmidhuber:02gtm,Hutter:03unipriors}
for even larger constructive classes), namely the class of {\em
all} semimeasures, which can be enumerated (=effectively be
approximated) from below:
\beq\label{Mclassdef}
\M:= \;\mbox{class of all enumerable semimeasures}.
\eeq
Solomonoff \cite[Eq.(7)]{Solomonoff:64} defined the universal
predictor $\MM(y|x)=M(xy)/M(x)$ with $M(x)$ defined as the
probability that the output of a universal monotone Turing machine
starts with $x$ when provided with fair coin flips on the input
tape.
%
Levin \cite{Zvonkin:70} has shown that this $M$ is a universal
enumerable semimeasure.
%
% Bayes mixtures
Another possible definition of $M$ is as a (Bayes) mixture
\cite{Solomonoff:64,Zvonkin:70,Solomonoff:78,Li:97,Hutter:03unipriors,Hutter:04uaibook}:
$\tilde M(x)=\sum_{\nu\in\M}2^{-K(\nu)}\nu(x)$, where $K(\nu)$ is
the length of the shortest program computing function $\nu$.
%
Levin \cite{Zvonkin:70} has shown that the class of {\em all}
enumerable semimeasures is enumerable (with repetitions), hence
$\tilde M$ is enumerable, since $K$ is co-enumerable. Hence
$\tilde\MM\in\M$, which implies
\beq\label{Mdom}
M(x) \;\geq\; w_{\tilde M} \tilde M(x)
\;\geq\; w_{\tilde M} 2^{-K(\nu)}\nu(x)
\;=\; w'_\nu\nu(x),
\qmbox{where} w'_\nu\eqm 2^{-K(\nu)}.
% \quad \forall x\in\X^*, \forall\nu\in\M
\eeq
Up to a multiplicative constant, $\MM$ assigns higher probability
to all $x$ than any other enumerable semimeasure.
All $M$ have the same very slowly decreasing (in $\nu$)
domination constants $w'_\nu$, essentially because $\MM\in\M$.
We drop the prime from $w'_\nu$ in the following.
%
The mixture definition $\tilde M$ immediately generalizes to
arbitrary weighted sums of (semi)measures over countable
classes other than $\M$, but the class may not contain the mixture, and
the domination constants may be rapidly decreasing. We will
exploit this for the construction of the non-universal semimeasure
$W$ in Sections~\ref{secMLconv} and \ref{secNUESW}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Predictive Convergence with Probability 1}\label{secConv}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following convergence results for $M$ are well-known
\cite{Solomonoff:78,Li:97,Hutter:02spupper,Hutter:04uaibook}.
%------------------------------%
\ftheorem{thConv}{Convergence of $M$ to $\mu$ w.p.1}{
%------------------------------%
For any universal semimeasure $M$ and any computable measure $\mu$ it holds:
\beqn
\mbox{$M(x'_n|x_{0$, i.e.\
$0_{1:\infty}$ is $\mu$-random. On the other hand, one can show
that $M(0_{1$ makes the expression infinite for some (Bernoulli)
distribution $\mu$ (however we choose $\nu$). For $\nu=M$ the
expression can become already infinite for $\alpha>\odt$ and some
computable measure $\mu$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Non-Convergence in Martin-L{\"o}f Sense}\label{secMLNconv}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Convergence of $M(x_n|x_{\delta
\qmbox{or}
\left|{R'(\a_{1:n})\over R'(\a_{\delta
\eeqn
(or both) for a non-vanishing fraction of $n$, where
supermartingale $R':=\odt(R+r)$ and some $\delta>0$.
}%------------------------------%
% The lower bound requirement $\eps0$ is a supermartingale. We
prove that the Theorem holds for infinitely many $n$. It is easy
to refine the proof to a non-vanishing fraction of $n$'s. Assume
that ${R(\a_{1:n})\over R(\a_{1$ implies $R\to\infty$, $\eta<1$
implies $R\to 0$. Since $R$ is bounded, $\eta$ must be 1, hence
for sufficiently large $n_0$ we
have $|R(\a_{1:n})-R(\a_{\; 0
\eeqn
for sufficiently small $\eps$ and $\delta$.
Similarly for (the infinitely many) $n\geq n_0$ for which
$r(\a_{\; 0.
\eeqn
This shows that Lemma~\ref{thSMnonConv} holds for infinitely
many $n$. If we define $r$ zero off-sequence, i.e.\ $r(x)=0$ for
$x\neq\a_{1:\l(x)}$, then $r$ is a supermartingale, but a
non-enumerable one, since $\alpha$ is not computable. In the next
lemma we define an enumerable supermartingale $r$, which completes
the proof of Lemma~\ref{thSMnonConv}. Finally note that we could
have defined $R'={R+\gamma r\over 1+\gamma}$ with arbitrarily
small $\gamma>0$, showing that already a small contamination can
destroy convergence. This is no longer true for the constructive
proof below.
\qed
%------------------------------%
\flemma{lemESM}{Enumerable supermartingale}{
%------------------------------%
Let $M^t$ with $t=1,2,3,...$ be computable approximations of $M$,
which enumerate $M$, i.e.\ $M^t(x)\nearrow M(x)$ for
$t\to\infty$. For each $t$ define recursively a sequence
$\alpha^t$ similarly to (\ref{defalpha}) as $\a^t_n=0$ if
$M^t(\a^t_{0$ from universality of $M$ and computability of $\lambda$ show
that the conditions of Lemma~\ref{thSMnonConv} are satisfied.
Hence $R^(\!\,'\!\,^)(\a_{1:n})/R^(\!\,'\!\,^)(\a_{t \\
\nu^t(x0)\!+\!\nu^t(x1) & \mbox{if} & \l(x)\a_{1:\l(x)}^t
\eeqn
On-sequence, i.e.\ for $x=\a_{1:n}$, $\nu^t$ is somewhere
in-between $0$ and $2^{-\l(x)}$. Since sequence $\a:=\lim_t\a^t$
is $\lambda$.M.L.-random it contains $01$ infinitely often,
actually $\a_n\a_{n+1}=01$ for a non-vanishing fraction of $n$. In
the following we fix such an $n$. For $t\geq n$ we get
\beqn
\nu^t(\a_{\a_{1:n}\geq\a_{1:n}^t,\text{ since }\a_n=0\nq\nq\nq})
= \nu^t(\a_{n$ large enough such that $\a_{1:n+1}^t=\a_{1:n+1}$ we get:
\beqn
\nu^t(\a_{1:n})
= \nu^t(\a_{1:n}^t)
\geq \nu^t(\underbrace{\a_{1:n}^t 0}_{\nq\nq\nq<\a_{1:n+1}^t,\text{ since }\a_{n+1}=1\nq\nq\nq})
= 2^{-n-1}
\quad\Rightarrow\quad \nu(\a_{1:n})\geq 2^{-n-1}
\eeqn
This ensures $\nu(\a_{1:n})\geq 2^{-n-1}\geq\odt M(\a_{1:n})$ by
(\ref{arand}). Let $M$ be any universal semimeasure and
$0<\gamma<{1\over 5}$. Then $M'(x):=(1-\gamma)\nu(x)+\gamma
M(x)\,\forall x$ is also a universal semimeasure with
\bqan
M'(\a_n|\a_{\;\; {1\over 2}.
\eqan
For instance for $\gamma={1\over 9}$ we have
$M'(\a_n|\a_{0
\\
ii) & \mbox{for}\; \v p^1,...,\v p^m \in\SetR_+^N & h(\v p^1,\v p^m)
& \leq & \displaystyle 3\sum_{k=2}^m k^2\,h(\v p^{k-1},\v p^k)
\end{array}
\eeqn
}%------------------------------%
\paradot{Proof} $(i)$
For any $x,y,z\in\SetR$ and $\beta>0$, squaring the triangle
inequality $|x-y|\leq|x-z|+|z-y|$ and chaining it with the
binomial $2|x-z||z-y| \leq \beta(x-z)^2+\beta^{-1}(z-y)^2$ shows
$(x-y)^2\leq(1+\beta)(x-z)^2+(1+\beta^{-1})(z-y)^2$. $(i)$ follows
for $x=\sqrt{p_i}$, $y=\sqrt{q_i}$, and $z=\sqrt{r_i}$ and
summation over $i$.
$(ii)$ Applying $(i)$ for the triples $(\v p^k,\v p^{k+1},\v p^m)$
for and in order of $k=1,2,...,m-2$ with $\beta=\beta_k$ gives
\beqn
h(\v p^1,\v p^m) \;\leq\;
\sum_{k=2}^m \bigg[\prod_{j=1}^{k-2}(1\!+\!\beta_j^{-1})\bigg]
\!\cdot\! (1\!+\!\beta_{k-1}) \!\cdot\! h(\v p^{k-1},\v p^k)
\eeqn
For $\beta_k=k(k+1)$ we have $\ln\prod_{j=1}^{k-2}(1+\beta_j^{-1}) \leq
\sum_{j=1}^\infty\ln(1+\beta_j^{-1}) \leq \sum_{j=1}^\infty \beta_j^{-1} = 1$
and $1+\beta_{k-1}\leq k^2$, which completes the proof. The choice
$\beta_k=2^{K(k)}$ would lead to a bound with $1+2^{K(k)}$ instead
of $k^2$. \qed
%------------------------------%
%\paradot{Expected to individual bound}
%------------------------------%
% Integral test -> supermartingale -> universal supermartingale -> randomness deficiency
We need a way to convert expected bounds to bounds on individual
M.L.\ random sequences, sort of a converse of ``M.L.\ implies
w.p.1''. Consider for instance the Hellinger sum
$H(\o):=\sum_{t=1}^\infty h_t(\mu,\rho)/\ln w^{-1}$ between two
computable measures $\rho\geq w\!\cdot\!\mu$. Then $H$ is an
enumerable function and Lemma~\ref{lemHBounds} implies $\E[H]\leq
1$, hence $H$ is an integral $\mu$-test. $H$ can be increased to an enumerable
$\mu$-supermartingale $\bar H$. The universal $\mu$-supermartingale
$M/\mu$ multiplicatively dominates all enumerable supermartingales
(and hence $\bar H$).
Since $M/\mu\leq 2^{d_\mu(\o)}$, this implies the desired bound
$H(\o) \leqm 2^{d_\mu(\o)}$ for individual $\o$.
We give a self-contained direct proof, explicating all important
constants.
%------------------------------%
\flemma{lemE2I}{Expected to Individual Bound}{
%------------------------------%
Let $F(\o)\geq 0$ be an enumerable function and $\mu$ be an
enumerable measure and $\eps>0$ be co-enumerable. Then:
\beqn
\qmbox{If} \E_\mu[F] \;\leq\; \eps
\qmbox{then} F(\o) \;\leqm\; \eps\!\cdot\! 2^{K(\mu,F,\,^1\!/\eps)+d_\mu(\o)}
\quad \forall\o
\eeqn
where $d_\mu(\o)$ is the $\mu$-randomness deficiency of $\o$ and
$K(\mu,F,\,^1\!/\eps)$ is the length of the shortest program for $\mu$,
$F$, and $^1\!/\eps$.
}%------------------------------%
Lemma~\ref{lemE2I} roughly says that for $\mu$, $F$, and
$\eps\eqm\E_\mu[F]$ with short program ($K(\mu,F,\,^1\!/\eps)=O(1)$) and
$\mu$-random $\o$ ($d_\mu(\o)=O(1)$) we have $F(\o)\leqm
\E_\mu[F]$.
\paradot{Proof}
Let $F(\o)=\lim_{n\to\infty}F_n(\o)=\sup_n F_n(\o)$ be enumerated
by an increasing sequence of computable functions $F_n(\o)$.
$F_n(\o)$ can be chosen to depend on $\o_{1:n}$ only, i.e.\
$F_n(\o)=F_n(\o_{1:n})$ is independent of $\o_{n+1:\infty}$. Let
$\eps_n\!\!\searrow\eps$ co-enumerate $\eps$. We define
\beqn
\bar\mu_n(\o_{1:k}) \;:=\; \eps_n^{-1} \nq\sum_{\o_{k+1:n}\in\X^{n-k}}\nq
\mu(\o_{1:n})F_n(\o_{1:n}) \;\;\mbox{for}\;\; k\leq n,
\qmbox{and} \bar\mu_n(\o_{1:k})=0 \;\;\mbox{for}\;\; k>n.
\eeqn
$\bar\mu_n$ is a computable semimeasure for each $n$ (due to
$\E_\mu[F_n]\leq\eps$) and increasing in $n$, since
\bqan
\bar\mu_n(\o_{1:k}) & \!\!\geq\!\! & 0 \;=\; \bar\mu_{n-1}(\o_{1:k}) \qmbox{for} k\geq n \qmbox{and}
\\
\bar\mu_n(\o_{k_0$. $\delta_{k-1}\leq\delta_k$ implies
\beqn
{\hat\delta_{k-1}(x)\over\hat\delta_k(x)}
\;\leq\; {\delta_k(\epstr)\over\delta_{k-1}(\epstr)}
\;\leq\; {\delta_{k-1}(\epstr)+\eps_k\over\delta_{k-1}(\epstr)}
\;=\; 1+{\eps_k\over\delta_{k-1}(\epstr)}
\;\leq\; 1+{\eps_k\over\eps_O},
\eeqn
where $O:=\min\{i\in J_{k-1}\}=O(1)$. Note that $J_{k-1}\ni k_0$ is
not empty. Since $\hat\delta_{k-1}$ and $\hat\delta_k$ are
measures, Lemma~\ref{lemHBounds} applies and shows
$\E_{\hat\delta_{k-1}}[H^k]\leq
\ln(1+{\eps_k\over\eps_O})\leq {\eps_k\over\eps_O}$.
Exploiting $\eps_{k_0}\mu\leq\hat\delta_{k-1}$, this implies
$\E_\mu[H^k]\leq {\eps_k\over\eps_O\eps_{k_0}}$. Lemma
\ref{lemE2I} then implies $H^k(\o)\leqm
{\eps_k\over\eps_O\eps_{k_0}}\cdot
2^{K(\mu,H^k,\eps_O\eps_{k_0}/\eps_k)+d_\mu(\o)}$.
Similarly as in $(i)$ we can bound
\beqn
K(\mu,H^k,\eps_O\eps_{k_0}/\eps_k) \leqa K(J_k|k)+K(k)+K(k_0) \leqa
k+2\lb k+2\lb k_0, \qmbox{hence}
\eeqn\vspace{-3ex}
\beqn\textstyle
H^k(\o) \;\leqm\; {\eps_k\over\eps_O\eps_{k_0}}\!\cdot\!k_0^2 k^2 2^k c_\o
\;\eqm\; k_0^8 2^{k_0}k^{-4}c_\o, \qmbox{where} c_\o:=2^{d_\mu(\o)}.
\eeqn
Chaining this bound via Lemma~\ref{lemHChain}$(ii)$ we get for $k_1>k_0$:
\bqan
\sum_{t=1}^n h_t(\hat\delta_{k_0},\hat\delta_{k_1})
&\leq& \sum_{t=1}^n 3 \!\!\sum_{k=k_0+1}^{k_1}\!\! (k\!-\!k_0\!+\!1)^2 h_t(\hat\delta_{k-1},\hat\delta_k)
\\
&\leq& 3 \!\!\sum_{k=k_0+1}^{k_1}\!\! k^2 H^k(\o)
\;\leqm\; 3k_0^8 2^{k_0}c_\o \!\!\sum_{k=k_0+1}^{k_1}\!\! k^{-2}
\;\leq\; 3k_0^7 2^{k_0}c_\o
\eqan
If we now take $k_1\to\infty$ we get $\sum_{t=1}^n
h_t(\hat\delta_{k_0},\hat D) \leqm 3k_0^7
2^{k_0+d_\mu(\o)}$. Finally let $n\to\infty$. \qed
The main properties allowing for proving $\hat D\to\mu$ were that
$\hat D$ is a measure with approximations $\hat\delta_k$, which
are computable in a certain sense. $\hat D$ is a mixture over all
enumerable/computable measures and hence incomputable.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{M.L.-Converging Enumerable Semimeasure $W$}\label{secNUESW}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The next step is to enlarge the class of computable measures to an
enumerable class of semimeasures, which are still sufficiently
close to measures in order not to spoil the convergence result.
For convergence w.p.1.\ we could include {\em all} semimeasures
(Theorem~\ref{thConv}). M.L.-convergence seems to require a more
restricted class. Included non-measures need to be zero on long
strings. We define quasimeasures as nearly normalized measures on $\X^{\leq
n}$.
%------------------------------%
\fdefinition{defQM}{Quasimeasures}{
%------------------------------%
$\tilde\nu:\X^*\to\SetR_+$ is called a quasimeasure {\em iff}
$\tilde\nu$ is a measure or:
$\sum_{a\in\X}\tilde\nu(xa)=\tilde\nu(x)$ for $\l(x)<n$,
$\tilde\nu(x)=0$ for $\l(x)>n$, and $1-{1\over
n}<\tilde\nu(\epstr)\leq 1$, for some $n\in\SetN$.
}%------------------------------%
%------------------------------%
\flemma{lemQM}{Quasimeasures}{
%------------------------------%
$(i)$ A quasimeasure is either a semimeasure which is zero on long strings -or- a measure.
$(ii)$ The set of enumerable quasimeasures is enumerable and contains all
computable measures.
}%------------------------------%
For enumerability it is important to include the measures in the
definition of quasimeasures. One way of enumeration would be
to enumerate all enumerable partial functions $f$ and convert them
to quasimeasures. Since we need a correspondence to
semimeasures, we convert a semimeasure $\nu$ directly
to a maximal quasimeasure $\tilde\nu\leq\nu$.
\paradot{Proof \& construction}
$(i)$ Obvious from Definition~\ref{defQM}.
$(ii)$ Let $\nu$ be an enumerable semimeasure enumerated by
$\nu^t\!\!\!\nearrow\!\nu$. Consider $m\equiv m^t := \max\{n\leq
t\,:\,\sum_{x_{1:n}}\nu^t(x_{1:n})>1-{1\over n}\}$. $m^t$ is finite
and monotone increasing in $t$. We define the
quasimeasure
\beqn
\rho^t(x_{1:n}) := \sum_{x_{n+1:m}\in\X^{m-n}}\nu^t(x_{1:m})
\qmbox{for} n\leq m \qmbox{and} \rho^t(x_{1:n})=0
\qmbox{for} n>m.
\eeqn
We define an increasing sequence in $t$ of quasimeasures
$\tilde\nu^t\leq\nu^t$ for $t=1,2,...$ recursively
starting with $\tilde\nu^0:=0$ as follows:
\beqn
\mbox{If $\rho^t(x_{1:n})\geq\tilde\nu^{t-1}(x_{1:n})$
$\forall x_{1:n}\forall n\leq m^t$ (and hence $\forall x$), then
$\tilde\nu^t:=\rho^t$, else $\tilde\nu^t:=\tilde\nu^{t-1}$.}
\eeqn
$\tilde\nu:=\lim_{t\to\infty}\tilde\nu^t$ is an enumerable
quasimeasure. Note that $m^\infty=\infty$ iff $\nu$ is a measure.
One can easily verify that $\tilde\nu\leq\nu$ and
$\tilde\nu\equiv\nu$ iff $\nu$ is a quasimeasure. This
implies that if $\nu_1,\nu_2,...$ is an enumeration of all
enumerable semimeasures, then $\tilde\nu_1,\tilde\nu_2,...$ is an
enumeration of all enumerable quasimeasures.
\qed
Let $\tilde\nu_1,\tilde\nu_2,...$ be the enumeration of all
enumerable quasimeasures constructed in the proof of Lemma
\ref{lemQM}, based on the enumeration of all enumerable
semimeasures $\nu_1,\nu_2,...$ with the property that
$\tilde\nu_i\leq\nu_i$ and equality holds if $\nu_i$ is a
(quasi)measure. We define the enumerable semimeasure
\beqn
W(x):=\sum_{i=1}^\infty\eps_i\tilde\nu_i(x),
\qmbox{and note that}
D(x)=\sum_{i\in J} \eps_i\tilde\nu_i(x)
\;\;\mbox{with}\;\;
J:=\{i:\tilde\nu_i\mbox{ is a measure}\}
\eeqn
with $\eps_i=i^{-6}2^{-i}$ as before.
To show $W\to D$ we need the following Lemma.
%------------------------------%
\flemma{lemHCont}{Hellinger Continuity}{
%------------------------------%
For $h_x(\mu,\nu):=\sum_{a\in\X}(\sqrt{\mu(a|x)}-\sqrt{\nu(a|x)})^2$,
where $\rho(y)=\mu(y)+\nu(y)$ $\forall y\in\X^*$ and $\mu$ and $\nu$
are semimeasures, it holds:
\beqn
\begin{array}{rl}
i) & h_x(\mu,\rho) \;\leq\; {\nu(x)\over\mu(x)}. \\[2mm]
ii) & h_x(\mu,\rho) \;\leq\; \odf\eps^2 \qmbox{if} \nu(x)\leq\eps\!\cdot\!\mu(x)
\qmbox{and} \nu(xb)\leq\eps\!\cdot\!\mu(xb) \; \forall b\in\X. \\
\end{array}
\eeqn
}%------------------------------%
$(ii)$ Since the Hellinger distance is locally quadratic, $h_x(\mu,\rho)$
scales quadratically in the deviation of predictor $\rho$ from $\mu$.
$(i)$ Closeness of $\rho(x)$ to $\mu(x)$ alone does not imply closeness
of the predictions, hence only a bound linear in the deviation is possible.
\paradot{Proof}
$(i)$ We identify $\X\cong\{1,...,N\}$ and define $y_i=\mu(xi)$,
$z_i=\nu(xi)$, $y=\mu(x)$, and $z=\nu(x)$. We extend
$(y_i)_{i=1}^N$ to a probability by defining $y_0=y-\sum_{i=1}^N
y_i\geq 0$ and set $z_0=0$. Also $\eps':=z/y$.
Exploiting $\sum_{i=0}^N y_i = y$ and
$\sum_{i=0}^N z_i\leq z$ and $z\leq\eps y$ and $y_i,z_i,y,z\geq 0$
we get
\beqn
h_x(\mu,\mu\!+\!\nu) \;\equiv\; \sum_{i=1}^N \Bigg(\sqrt{y_i\over y}-\sqrt{y_i\!+\!z_i\over y\!+\!z}\Bigg)^2
\;\leq\; \sum_{i=0}^N \Bigg(\sqrt{y_i\over y}-\sqrt{y_i\!+\!z_i\over y\!+\!z}\Bigg)^2
\eeqn\vspace{-1ex}
\beqn
\;=\; \sum_{i=0}^N \Bigg({y_i\over y}+{y_i\!+\!z_i\over y\!+\!z}-2\sqrt{y_i(y_i\!+\!z_i)\over y(y\!+\!z)}\Bigg)
\;\leq\; 2-2\sum_{i=0}^N{y_i\over\sqrt{y(y\!+\!z)}}
\;=\; 2-{2\over\sqrt{1\!+\!\eps'}}
\;\leq\; \eps'.
\eeqn
$(ii)$ With the notation from $(i)$, additionally exploiting
$z_i\leq\eps y_i$ we get
\bqan
\sqrt{y_i\!+\!z_i\over y\!+\!z}-\sqrt{y_i\over y}
&\leq& {\sqrt{y_i\!+\!z_i}-\sqrt{y_i}\over\sqrt{y}}
\;\leq\; {\sqrt{y_i(1\!+\!\eps)}-\sqrt{y_i}\over\sqrt{y}}
\;\leq\; {\eps\over 2}\sqrt{y_i\over y}
\qquad\mbox{and}
\\
\sqrt{y_i\over y}-\sqrt{y_i\!+\!z_i\over y\!+\!z}
&=& {\sqrt{y_i(1\!+\!\eps')}-\sqrt{y_i+z_i}\over\sqrt{y(1\!+\!\eps')}}
\;\leq\; {\sqrt{y_i(1\!+\!\eps')}-\sqrt{y_i}\over\sqrt{y(1\!+\!\eps')}}
\;\leq\; {\eps'\over 2}\sqrt{y_i\over y}.
\eqan
Exploiting $\eps'\leq\eps$, taking the square and summing over $i$ proves $(ii)$.
\qed
%------------------------------%
\fproposition{proWtoD}{Convergence of enumerable $W$ to incomputable $D$}{
%------------------------------%
For every computable measure $\mu$ and for $\o$ being
$\mu$-random, the following holds for $t\to\infty$:
\beqn
(i) \;\; {W(\o_{1:t})\over D(\o_{1:t})}\to 1, \qquad
(ii) \;\; {W(\o_t|\o_{0$ if $\nu$ is a computable measure. If
$\nu$ is a ``proper'' quasimeasure, then
$\min_{x\in\X^*}{D(x)\over\nu(x)}=\min_{x:\l(x)\leq
m_\nu}{D(x)\over\nu(x)}>0$, since $\nu(x)=0$ for
$\l(x)>m_\nu<\infty$, and $D(x)>0\,\forall x$.
%
Second sentence:
It is well known that there is no computable semimeasure
dominating all computable measures (see e.g.\
\cite[Thm.4]{Hutter:03unipriors}), which shows that $D$, $\hat
D$ and $W$ cannot be computable.
%
We now show that $D$ and $W$ do not dominate the enumerable
semimeasure $M$ by extending this argument. Let $\nu$ be a
nowhere\footnote{$M$, $W$, $\hat D$, $D$, and $\delta_k$ for
$k\geq O(1)$ are nowhere zero. Alternatively one can verify that
all relevant assertions remain valid if $\nu$ is somewhere zero.}
zero computable semimeasure.
We define a computable sequence $\a$ as follows by induction:
Given $\a_{