From 4953d71b7a3cfe19c29f5155d0285ffb9d7b2025 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 11 Apr 2013 14:54:25 +0100
Subject: [PATCH 01/11] first trivial model touches

---
 GPy/examples/dimensionality_reduction.py | 71 +++++++++++++++++-------
 GPy/models/GPLVM.py                      |  4 +-
 GPy/models/__init__.py                   |  3 +
 3 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 61a4abd8..a82e0af4 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -6,26 +6,27 @@ import pylab as pb
 from matplotlib import pyplot as plt
 
 import GPy
+from GPy.models.mrd import MRD
 
 default_seed = np.random.seed(123344)
 
-def BGPLVM(seed = default_seed):
+def BGPLVM(seed=default_seed):
     N = 10
     M = 3
     Q = 2
     D = 4
-    #generate GPLVM-like data
+    # generate GPLVM-like data
     X = np.random.rand(N, Q)
     k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
     K = k.K(X)
-    Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
 
-    k = GPy.kern.linear(Q, ARD = True) + GPy.kern.white(Q)
+    k = GPy.kern.linear(Q, ARD=True) + GPy.kern.white(Q)
     # k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
     # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
     # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
 
-    m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
+    m = GPy.models.Bayesian_GPLVM(Y, Q, kernel=k, M=M)
     m.constrain_positive('(rbf|bias|noise|white|S)')
     # m.constrain_fixed('S', 1)
 
@@ -38,44 +39,44 @@ def BGPLVM(seed = default_seed):
     # pb.title('After optimisation')
     m.ensure_default_constraints()
     m.randomize()
-    m.checkgrad(verbose = 1)
+    m.checkgrad(verbose=1)
 
     return m
 
-def GPLVM_oil_100(optimize=True,M=15):
+def GPLVM_oil_100(optimize=True, M=15):
     data = GPy.util.datasets.oil_100()
 
     # create simple GP model
-    kernel = GPy.kern.rbf(6, ARD = True) + GPy.kern.bias(6)
+    kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
     m = GPy.models.GPLVM(data['X'], 6, kernel=kernel, M=M)
     m.data_labels = data['Y'].argmax(axis=1)
 
     # optimize
     m.ensure_default_constraints()
     if optimize:
-        m.optimize('scg',messages=1)
+        m.optimize('scg', messages=1)
 
     # plot
     print(m)
     m.plot_latent(labels=m.data_labels)
     return m
 
-def BGPLVM_oil(optimize=True,N=100,Q=10,M=15):
+def BGPLVM_oil(optimize=True, N=100, Q=10, M=15):
     data = GPy.util.datasets.oil()
 
     # create simple GP model
-    kernel = GPy.kern.rbf(Q, ARD = True) + GPy.kern.bias(Q) + GPy.kern.white(Q,0.001)
-    m = GPy.models.Bayesian_GPLVM(data['X'][:N], Q, kernel = kernel,M=M)
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.001)
+    m = GPy.models.Bayesian_GPLVM(data['X'][:N], Q, kernel=kernel, M=M)
     m.data_labels = data['Y'][:N].argmax(axis=1)
 
     # optimize
     if optimize:
-        m.constrain_fixed('noise',0.05)
+        m.constrain_fixed('noise', 0.05)
         m.ensure_default_constraints()
-        m.optimize('scg',messages=1)
+        m.optimize('scg', messages=1)
         m.unconstrain('noise')
         m.constrain_positive('noise')
-        m.optimize('scg',messages=1)
+        m.optimize('scg', messages=1)
     else:
         m.ensure_default_constraints()
 
@@ -83,7 +84,7 @@ def BGPLVM_oil(optimize=True,N=100,Q=10,M=15):
     print(m)
     m.plot_latent(labels=m.data_labels)
     pb.figure()
-    pb.bar(np.arange(m.kern.D),1./m.input_sensitivity())
+    pb.bar(np.arange(m.kern.D), 1. / m.input_sensitivity())
     return m
 
 def oil_100():
@@ -96,7 +97,37 @@ def oil_100():
 
     # plot
     print(m)
-    #m.plot_latent(labels=data['Y'].argmax(axis=1))
+    # m.plot_latent(labels=data['Y'].argmax(axis=1))
+    return m
+
+def mrd_simulation():
+    num = 2
+    ard1 = np.array([1., 1, 0, 0], dtype=float)
+    ard2 = np.array([0., 1, 1, 0], dtype=float)
+    ard1[ard1 == 0] = 1E+10
+    ard2[ard2 == 0] = 1E+10
+
+    make_params = lambda ard: np.hstack([[1], ard, [1, .3]])
+
+    D1, D2, N, M, Q = 50, 100, 150, 15, 4
+    X = np.random.randn(N, Q)
+
+    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard1) + GPy.kern.bias(Q, 1) + GPy.kern.white(Q, 0.0001)
+    Y1 = np.random.multivariate_normal(np.zeros(N), k.K(X), D1).T
+    Y1 -= Y1.mean(0)
+
+    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard2) + GPy.kern.bias(Q, 1) + GPy.kern.white(Q, 0.0001)
+    Y2 = np.random.multivariate_normal(np.zeros(N), k.K(X), D2).T
+    Y2 -= Y2.mean(0)
+
+    k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q, 1.0)
+
+    m = MRD(Y1, Y2, Q=Q, M=M, kernel=k, _debug=False)
+    m.ensure_default_constraints()
+
+    m.optimize(messages=1, max_f_eval=5000)
+
+    import ipdb;ipdb.set_trace()
     return m
 
 def brendan_faces():
@@ -109,7 +140,7 @@ def brendan_faces():
     m.optimize(messages=1, max_f_eval=10000)
 
     ax = m.plot_latent()
-    y = m.likelihood.Y[0,:]
+    y = m.likelihood.Y[0, :]
     data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
     lvm_visualizer = GPy.util.visualize.lvm(m, data_show, ax)
     raw_input('Press enter to finish')
@@ -120,13 +151,13 @@ def brendan_faces():
 def stick():
     data = GPy.util.datasets.stick()
     m = GPy.models.GPLVM(data['Y'], 2)
-    
+
     # optimize
     m.ensure_default_constraints()
     m.optimize(messages=1, max_f_eval=10000)
 
     ax = m.plot_latent()
-    y = m.likelihood.Y[0,:]
+    y = m.likelihood.Y[0, :]
     data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
     lvm_visualizer = GPy.util.visualize.lvm(m, data_show, ax)
     raw_input('Press enter to finish')
diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
index cc4be70e..2b1002eb 100644
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@@ -55,7 +55,7 @@ class GPLVM(GP):
 
     def plot(self):
         assert self.likelihood.Y.shape[1]==2
-        pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet)
+        pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet)  # @UndefinedVariable
         Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
         mu, var, upper, lower = self.predict(Xnew)
         pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
@@ -90,7 +90,7 @@ class GPLVM(GP):
         Xtest_full[:, :2] = Xtest
         mu, var, low, up = self.predict(Xtest_full)
         var = var.mean(axis=1) # this was var[:, :2] edit by Neil
-        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')
+        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')  # @UndefinedVariable
 
 
         for i,ul in enumerate(np.unique(labels)):
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index f442dc67..91cc60e3 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -11,4 +11,7 @@ from warped_GP import warpedGP
 from sparse_GPLVM import sparse_GPLVM
 from uncollapsed_sparse_GP import uncollapsed_sparse_GP
 from Bayesian_GPLVM import Bayesian_GPLVM
+import mrd
+MRD = mrd.MRD
+del mrd
 from generalized_FITC import generalized_FITC

From 3593e9f5863164651ab5072ad138c7694ca1cf91 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 11 Apr 2013 15:02:26 +0100
Subject: [PATCH 02/11] mrd touches

---
 GPy/models/mrd.py        | 139 +++++++++++++++++++++++++++++++++++++++
 GPy/testing/mrd_tests.py |  32 +++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 GPy/models/mrd.py
 create mode 100644 GPy/testing/mrd_tests.py

diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
new file mode 100644
index 00000000..0ba4a695
--- /dev/null
+++ b/GPy/models/mrd.py
@@ -0,0 +1,139 @@
+'''
+Created on 10 Apr 2013
+
+@author: Max Zwiessele
+'''
+from GPy.core import model
+from GPy.models.Bayesian_GPLVM import Bayesian_GPLVM
+import numpy
+from GPy.models.sparse_GP import sparse_GP
+import itertools
+
+
+class MRD(model):
+    """
+    Do MRD on given Datasets in Ylist.
+    All Ys in Ylist are in [N x Dn], where Dn can be different per Yn,
+    N must be shared across datasets though.
+
+    :param Ylist...: observed datasets
+    :type Ylist: [np.ndarray]
+    :param names: names for different gplvm models
+    :type names: [str]
+    :param Q: latent dimensionality
+    :type Q: int
+    :param init: initialisation method for the latent space
+    :type init: 'PCA'|'random'
+    :param X:
+        Initial latent space
+    :param X_variance:
+        Initial latent space variance
+    :param init: [PCA|random]
+        initialization method to use
+    :param M:
+        number of inducing inputs to use
+    :param Z:
+        initial inducing inputs
+    :param kernel:
+        kernel to use
+    """
+    def __init__(self, *Ylist, **kwargs):
+        self._debug = False
+        if kwargs.has_key("_debug"):
+            self._debug = kwargs['_debug']
+            del kwargs['_debug']
+        if kwargs.has_key("names"):
+            self.names = kwargs['names']
+            del kwargs['names']
+        else:
+            self.names = ["{}".format(i + 1) for i in range(len(Ylist))]
+        if kwargs.has_key('kernel'):
+            kernel = kwargs['kernel']
+            k = lambda: kernel.copy()
+            del kwargs['kernel']
+        else:
+            k = lambda: None
+        self.bgplvms = [Bayesian_GPLVM(Y, kernel=k(), **kwargs) for Y in Ylist]
+        self.gref = self.bgplvms[0]
+        nparams = numpy.array([0] + [sparse_GP._get_params(g).size - g.Z.size for g in self.bgplvms])
+        self.nparams = nparams.cumsum()
+        self.Q = self.gref.Q
+        self.N = self.gref.N
+        self.NQ = self.N * self.Q
+        self.M = self.gref.M
+        self.MQ = self.M * self.Q
+
+        model.__init__(self)  # @UndefinedVariable
+
+    def _get_param_names(self):
+        # X_names = sum([['X_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], [])
+        # S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], [])
+        n1 = self.gref._get_param_names()
+        n1var = n1[:self.NQ * 2 + self.MQ]
+        map_names = lambda ns, name: map(lambda x: "{1}_{0}".format(*x),
+                                         itertools.izip(ns,
+                                                        itertools.repeat(name)))
+        return list(itertools.chain(n1var, *(map_names(\
+                sparse_GP._get_param_names(g)[self.MQ:], n) \
+                for g, n in zip(self.bgplvms, self.names))))
+
+    def _get_params(self):
+        """
+        return parameter list containing private and shared parameters as follows:
+        
+        =================================================================
+        | mu | S | Z || theta1 | theta2 | .. | thetaN |
+        =================================================================
+        """
+        X = self.gref.X.flatten()
+        X_var = self.gref.X_variance.flatten()
+        Z = self.gref.Z.flatten()
+        thetas = [sparse_GP._get_params(g)[g.Z.size:] for g in self.bgplvms]
+        params = numpy.hstack([X, X_var, Z, numpy.hstack(thetas)])
+        return params
+
+    def _set_var_params(self, g, X, X_var, Z):
+        g.X = X
+        g.X_variance = X_var
+        g.Z = Z
+
+    def _set_kern_params(self, g, p):
+        g.kern._set_params(p[:g.kern.Nparam])
+        g.likelihood._set_params(p[g.kern.Nparam:])
+
+    def _set_params(self, x):
+        start = 0; end = self.NQ
+        X = x[start:end].reshape(self.N, self.Q)
+        start = end; end += start
+        X_var = x[start:end].reshape(self.N, self.Q)
+        start = end; end += self.MQ
+        Z = x[start:end].reshape(self.M, self.Q)
+        thetas = x[end:]
+
+        # set params for all others:
+        for g, s, e in itertools.izip(self.bgplvms, self.nparams, self.nparams[1:]):
+            self._set_var_params(g, X, X_var, Z)
+            self._set_kern_params(g, thetas[s:e])
+            g._compute_kernel_matrices()
+            g._computations()
+
+
+    def log_likelihood(self):
+        ll = self.gref.KL_divergence()
+        for g in self.bgplvms:
+            ll += sparse_GP.log_likelihood(g)
+        return ll
+
+    def _log_likelihood_gradients(self):
+        dldmus = self.gref.dL_dmuS().flatten()
+        dldzt1 = sparse_GP._log_likelihood_gradients(self.gref)
+        return numpy.hstack((dldmus,
+                             dldzt1,
+                numpy.hstack([numpy.hstack([g.dL_dtheta(),
+                                            g.likelihood._gradients(\
+                                                partial=g.partial_for_likelihood)]) \
+                              for g in self.bgplvms[1:]])))
+
+    def plot_scales(self):
+        pass
+
diff --git a/GPy/testing/mrd_tests.py b/GPy/testing/mrd_tests.py
new file mode 100644
index 00000000..e25a1040
--- /dev/null
+++ b/GPy/testing/mrd_tests.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2013, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+'''
+Created on 10 Apr 2013
+
+@author: maxz
+'''
+
+import unittest
+import numpy as np
+import GPy
+
+# class MRDTests(unittest.TestCase):
+#
+#     # @unittest.skip('')
+#     def test_gradients(self):
+#         num_m = 2
+#         N, M, Q, D = 10, 3, 2, 4
+#         X = np.random.rand(N, Q)
+#         k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+#         K = k.K(X)
+#         Ylist = [np.random.multivariate_normal(np.zeros(N), K, D).T for _ in range(num_m)]
+#
+#         m = GPy.models.MRD(*Ylist, Q=Q, kernel=k, M=M)
+#         m._debug = True
+#         m.ensure_default_constraints()
+#         m.randomize()
+#         self.assertTrue(m.checkgrad())
+#
+# if __name__ == "__main__":
+#     print "Running unit tests, please be (very) patient..."
+#     # unittest.main()

From e99993b8ce0cd0b1ad3084a01dc593926a38222b Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 11 Apr 2013 15:11:41 +0100
Subject: [PATCH 03/11] GPLVM readded

---
 GPy/models/GPLVM.py | 125 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 GPy/models/GPLVM.py

diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
new file mode 100644
index 00000000..32594594
--- /dev/null
+++ b/GPy/models/GPLVM.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+import pylab as pb
+import sys, pdb
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv, PCA
+from GP import GP
+from ..likelihoods import Gaussian
+from .. import util
+
+class GPLVM(GP):
+    """
+    Gaussian Process Latent Variable Model
+
+    :param Y: observed data
+    :type Y: np.ndarray
+    :param Q: latent dimensionality
+    :type Q: int
+    :param init: initialisation method for the latent space
+    :type init: 'PCA'|'random'
+
+    """
+    def __init__(self, Y, Q, init='PCA', X = None, kernel=None, **kwargs):
+        if X is None:
+            X = self.initialise_latent(init, Q, Y)
+        if kernel is None:
+            kernel = kern.rbf(Q) + kern.bias(Q)
+        likelihood = Gaussian(Y)
+        GP.__init__(self, X, likelihood, kernel, **kwargs)
+
+    def initialise_latent(self, init, Q, Y):
+        if init == 'PCA':
+            return PCA(Y, Q)[0]
+        else:
+            return np.random.randn(Y.shape[0], Q)
+
+    def _get_param_names(self):
+        return sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[]) + GP._get_param_names(self)
+
+    def _get_params(self):
+        return np.hstack((self.X.flatten(), GP._get_params(self)))
+
+    def _set_params(self,x):
+        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
+        GP._set_params(self, x[self.X.size:])
+
+    def _log_likelihood_gradients(self):
+        dL_dX = 2.*self.kern.dK_dX(self.dL_dK,self.X)
+
+        return np.hstack((dL_dX.flatten(),GP._log_likelihood_gradients(self)))
+
+    def plot(self):
+        assert self.likelihood.Y.shape[1]==2
+        pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet)
+        Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
+        mu, var, upper, lower = self.predict(Xnew)
+        pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
+
+    def plot_latent(self,labels=None, which_indices=None, resolution=50):
+        """
+        :param labels: a np.array of size self.N containing labels for the points (can be number, strings, etc)
+        :param resolution: the resolution of the grid on which to evaluate the predictive variance
+        """
+
+        util.plot.Tango.reset()
+        
+        if labels is None:
+            labels = np.ones(self.N)
+        if which_indices is None:
+            if self.Q==1:
+                input_1 = 0
+                input_2 = None
+            if self.Q==2:
+                input_1, input_2 = 0,1
+            else:
+                #try to find a linear of RBF kern in the kernel
+                k = [p for p in self.kern.parts if p.name in ['rbf','linear']]
+                if (not len(k)==1) or (not k[0].ARD):
+                    raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
+                k = k[0]
+                if k.name=='rbf':
+                    input_1, input_2 = np.argsort(k.lengthscale)[:2]
+                elif k.name=='linear':
+                    input_1, input_2 = np.argsort(k.variances)[::-1][:2]
+
+        #first, plot the output variance as a function of the latent space
+        Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(self.X[:,[input_1, input_2]],resolution=resolution)
+	Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
+	Xtest_full[:, :2] = Xtest
+	mu, var, low, up = self.predict(Xtest_full)
+	var = var[:, :2]
+        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')
+
+
+        for i,ul in enumerate(np.unique(labels)):
+            if type(ul) is np.string_:
+                this_label = ul
+            elif type(ul) is np.int64:
+                this_label = 'class %i'%ul
+            else:
+                this_label = 'class %i'%i
+
+            index = np.nonzero(labels==ul)[0]
+            if self.Q==1:
+                x = self.X[index,input_1]
+                y = np.zeros(index.size)
+            else:
+                x = self.X[index,input_1]
+                y = self.X[index,input_2]
+            pb.plot(x,y,marker='o',color=util.plot.Tango.nextMedium(),mew=0,label=this_label,linewidth=0)
+
+        pb.xlabel('latent dimension %i'%input_1)
+        pb.ylabel('latent dimension %i'%input_2)
+
+        if not np.all(labels==1.):
+            pb.legend(loc=0,numpoints=1)
+
+        pb.xlim(xmin[0],xmax[0])
+        pb.ylim(xmin[1],xmax[1])
+
+        return input_1, input_2

From dcbd76a6ee4a2d685000625806b81da2b441c591 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 11 Apr 2013 15:47:18 +0100
Subject: [PATCH 04/11] kern plotting with axisa

---
 GPy/examples/dimensionality_reduction.py | 26 +++++++++++++++---------
 GPy/kern/kern.py                         | 10 ++++-----
 GPy/models/mrd.py                        | 16 +++++++++------
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 248183ff..bfdc9bb6 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 import pylab as pb
-from matplotlib import pyplot as plt
+from matplotlib import pyplot as plt, pyplot
 
 import GPy
 from GPy.models.mrd import MRD
@@ -101,22 +101,25 @@ def oil_100():
     return m
 
 def mrd_simulation():
-    num = 2
+    # num = 2
     ard1 = np.array([1., 1, 0, 0], dtype=float)
     ard2 = np.array([0., 1, 1, 0], dtype=float)
-    ard1[ard1 == 0] = 1E+10
-    ard2[ard2 == 0] = 1E+10
+    ard1[ard1 == 0] = 1E-10
+    ard2[ard2 == 0] = 1E-10
 
-    make_params = lambda ard: np.hstack([[1], ard, [1, .3]])
+    ard1i = 1. / ard1
+    ard2i = 1. / ard2
+
+    # make_params = lambda ard: np.hstack([[1], ard, [1, .3]])
 
     D1, D2, N, M, Q = 50, 100, 150, 15, 4
     X = np.random.randn(N, Q)
 
-    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard1) + GPy.kern.bias(Q, 1) + GPy.kern.white(Q, 0.0001)
+    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard1i) + GPy.kern.bias(Q, 0) + GPy.kern.white(Q, 0.0001)
     Y1 = np.random.multivariate_normal(np.zeros(N), k.K(X), D1).T
     Y1 -= Y1.mean(0)
 
-    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard2) + GPy.kern.bias(Q, 1) + GPy.kern.white(Q, 0.0001)
+    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard2i) + GPy.kern.bias(Q, 0) + GPy.kern.white(Q, 0.0001)
     Y2 = np.random.multivariate_normal(np.zeros(N), k.K(X), D2).T
     Y2 -= Y2.mean(0)
 
@@ -125,9 +128,12 @@ def mrd_simulation():
     m = MRD(Y1, Y2, Q=Q, M=M, kernel=k, _debug=False)
     m.ensure_default_constraints()
 
-    m.optimize(messages=1, max_f_eval=5000)
+    fig = pyplot.figure("expected", figsize=(8, 3))
+    ax = fig.add_subplot(121)
+    ax.bar(np.arange(ard1.size) + .1, ard1)
+    ax = fig.add_subplot(122)
+    ax.bar(np.arange(ard2.size) + .1, ard2)
 
-    import ipdb;ipdb.set_trace()
     return m
 
 def brendan_faces():
@@ -175,7 +181,7 @@ def BGPLVM_oil():
     Q = 10
     M = 30
 
-    kernel = GPy.kern.rbf(Q, ARD = True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
     m = GPy.models.Bayesian_GPLVM(X, Q, kernel=kernel, M=M)
     # m.scale_factor = 100.0
     m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 7d3b1737..6f9c97f6 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -52,7 +52,7 @@ class kern(parameterised):
         parameterised.__init__(self)
 
 
-    def plot_ARD(self):
+    def plot_ARD(self, ax=pb.gca()):
         """
         If an ARD kernel is present, it bar-plots the ARD parameters
 
@@ -60,16 +60,16 @@ class kern(parameterised):
         """
         for p in self.parts:
             if hasattr(p, 'ARD') and p.ARD:
-                pb.figure()
-                pb.title('ARD parameters, %s kernel' % p.name)
+                ax.set_title('ARD parameters, %s kernel' % p.name)
 
                 if p.name == 'linear':
                     ard_params = p.variances
                 else:
                     ard_params = 1./p.lengthscale
 
-                pb.bar(np.arange(len(ard_params))-0.4, ard_params)
-
+                ax.bar(np.arange(len(ard_params)) - 0.4, ard_params)
+                ax.set_xticks(np.arange(len(ard_params)),
+                              ["${}$".format(i + 1) for i in range(len(ard_params))])
 
 
     def _transform_gradients(self,g):
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 0ba4a695..842ef42f 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -8,6 +8,8 @@ from GPy.models.Bayesian_GPLVM import Bayesian_GPLVM
 import numpy
 from GPy.models.sparse_GP import sparse_GP
 import itertools
+from matplotlib import pyplot
+import pylab
 
 
 class MRD(model):
@@ -103,17 +105,17 @@ class MRD(model):
 
     def _set_params(self, x):
         start = 0; end = self.NQ
-        X = x[start:end].reshape(self.N, self.Q)
+        X = x[start:end].reshape(self.N, self.Q).copy()
         start = end; end += start
-        X_var = x[start:end].reshape(self.N, self.Q)
+        X_var = x[start:end].reshape(self.N, self.Q).copy()
         start = end; end += self.MQ
-        Z = x[start:end].reshape(self.M, self.Q)
+        Z = x[start:end].reshape(self.M, self.Q).copy()
         thetas = x[end:]
 
         # set params for all others:
         for g, s, e in itertools.izip(self.bgplvms, self.nparams, self.nparams[1:]):
             self._set_var_params(g, X, X_var, Z)
-            self._set_kern_params(g, thetas[s:e])
+            self._set_kern_params(g, thetas[s:e].copy())
             g._compute_kernel_matrices()
             g._computations()
 
@@ -135,5 +137,7 @@ class MRD(model):
                               for g in self.bgplvms[1:]])))
 
     def plot_scales(self):
-        pass
-
+        fig = pylab.figure("MRD Scales", figsize=(4 * len(self.bgplvms), 3))
+        for i, g in enumerate(self.bgplvms):
+            ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
+            g.kern.plot_ARD(ax=ax)

From de6b00ebfdee91c2ce106ccd414455e19ab79399 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 11 Apr 2013 16:00:14 +0100
Subject: [PATCH 05/11] plot_latent added for mrd

---
 GPy/models/GPLVM.py | 85 ++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 44 deletions(-)

diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
index ca380a77..2ce55dda 100644
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@@ -24,7 +24,7 @@ class GPLVM(GP):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, Q, init='PCA', X=None, kernel=None, **kwargs):
+    def __init__(self, Y, Q, init='PCA', X = None, kernel=None, **kwargs):
         if X is None:
             X = self.initialise_latent(init, Q, Y)
         if kernel is None:
@@ -39,28 +39,28 @@ class GPLVM(GP):
             return np.random.randn(Y.shape[0], Q)
 
     def _get_param_names(self):
-        return sum([['X_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], []) + GP._get_param_names(self)
+        return sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[]) + GP._get_param_names(self)
 
     def _get_params(self):
         return np.hstack((self.X.flatten(), GP._get_params(self)))
 
-    def _set_params(self, x):
-        self.X = x[:self.X.size].reshape(self.N, self.Q).copy()
+    def _set_params(self,x):
+        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
         GP._set_params(self, x[self.X.size:])
 
     def _log_likelihood_gradients(self):
-        dL_dX = 2.*self.kern.dK_dX(self.dL_dK, self.X)
+        dL_dX = 2.*self.kern.dK_dX(self.dL_dK,self.X)
 
-        return np.hstack((dL_dX.flatten(), GP._log_likelihood_gradients(self)))
+        return np.hstack((dL_dX.flatten(),GP._log_likelihood_gradients(self)))
 
     def plot(self):
-        assert self.likelihood.Y.shape[1] == 2
-        pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet)
-        Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
+        assert self.likelihood.Y.shape[1]==2
+        pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet)
+        Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
         mu, var, upper, lower = self.predict(Xnew)
-        pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
+        pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
 
-    def plot_latent(self, labels=None, which_indices=None, resolution=50):
+    def plot_latent(self,labels=None, which_indices=None, resolution=50):
         """
         :param labels: a np.array of size self.N containing labels for the points (can be number, strings, etc)
         :param resolution: the resolution of the grid on which to evaluate the predictive variance
@@ -71,57 +71,54 @@ class GPLVM(GP):
         if labels is None:
             labels = np.ones(self.N)
         if which_indices is None:
-            if self.Q == 1:
+            if self.Q==1:
                 input_1 = 0
                 input_2 = None
-            if self.Q == 2:
-                input_1, input_2 = 0, 1
+            if self.Q==2:
+                input_1, input_2 = 0,1
             else:
-                # try to find a linear of RBF kern in the kernel
-                k = [p for p in self.kern.parts if p.name in ['rbf', 'linear']]
-                if (not len(k) == 1) or (not k[0].ARD):
+                try:
+                    input_1, input_2 = np.argsort(self.input_sensitivity())[:2]
+                except:
                     raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
-                k = k[0]
-                if k.name == 'rbf':
-                    input_1, input_2 = np.argsort(k.lengthscale)[:2]
-                elif k.name == 'linear':
-                    input_1, input_2 = np.argsort(k.variances)[::-1][:2]
+        else:
+            input_1, input_2 = which_indices
 
-        # first, plot the output variance as a function of the latent space
-        Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(self.X[:, [input_1, input_2]], resolution=resolution)
+        #first, plot the output variance as a function of the latent space
+        Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(self.X[:,[input_1, input_2]],resolution=resolution)
         Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
         Xtest_full[:, :2] = Xtest
         mu, var, low, up = self.predict(Xtest_full)
-        var = var[:, :1]
-        pb.imshow(var.reshape(resolution, resolution).T[::-1, :],
-                  extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary, interpolation='bilinear')
+        var = var[:, :1] 
+        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],
+                  extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear')
 
-        for i, ul in enumerate(np.unique(labels)):
+        for i,ul in enumerate(np.unique(labels)):
             if type(ul) is np.string_:
                 this_label = ul
             elif type(ul) is np.int64:
-                this_label = 'class %i' % ul
+                this_label = 'class %i'%ul
             else:
-                this_label = 'class %i' % i
+                this_label = 'class %i'%i
 
-            index = np.nonzero(labels == ul)[0]
-            if self.Q == 1:
-                x = self.X[index, input_1]
+            index = np.nonzero(labels==ul)[0]
+            if self.Q==1:
+                x = self.X[index,input_1]
                 y = np.zeros(index.size)
             else:
-                x = self.X[index, input_1]
-                y = self.X[index, input_2]
-            pb.plot(x, y, marker='o', color=util.plot.Tango.nextMedium(), mew=0, label=this_label, linewidth=0)
+                x = self.X[index,input_1]
+                y = self.X[index,input_2]
+            pb.plot(x,y,marker='o',color=util.plot.Tango.nextMedium(),mew=0,label=this_label,linewidth=0)
 
-        pb.xlabel('latent dimension %i' % input_1)
-        pb.ylabel('latent dimension %i' % input_2)
+        pb.xlabel('latent dimension %i'%input_1)
+        pb.ylabel('latent dimension %i'%input_2)
 
-        if not np.all(labels == 1.):
-            pb.legend(loc=0, numpoints=1)
+        if not np.all(labels==1.):
+            pb.legend(loc=0,numpoints=1)
 
-        pb.xlim(xmin[0], xmax[0])
-        pb.ylim(xmin[1], xmax[1])
-        pb.grid(b=False)  # remove the grid if present, it doesn't look good
+        pb.xlim(xmin[0],xmax[0])
+        pb.ylim(xmin[1],xmax[1])
+        pb.grid(b=False) # remove the grid if present, it doesn't look good
         ax = pb.gca()
-        ax.set_aspect('auto')  # set a nice aspect ratio
+        ax.set_aspect('auto') # set a nice aspect ratio
         return ax

From 51ff92e59151a08a05097565457e113f9090b311 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 11 Apr 2013 18:44:18 +0100
Subject: [PATCH 06/11] finished mrd and added plotting functions

---
 GPy/examples/dimensionality_reduction.py | 43 ++++++++++--------
 GPy/inference/SCG.py                     |  3 +-
 GPy/kern/kern.py                         |  2 +-
 GPy/models/Bayesian_GPLVM.py             | 44 +++++++++++--------
 GPy/models/GPLVM.py                      | 18 ++++----
 GPy/models/mrd.py                        | 55 ++++++++++++++++++++----
 GPy/testing/mrd_tests.py                 | 40 ++++++++---------
 7 files changed, 130 insertions(+), 75 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index c4e064a5..93ce374d 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -107,32 +107,41 @@ def mrd_simulation():
     ard1[ard1 == 0] = 1E-10
     ard2[ard2 == 0] = 1E-10
 
-    ard1i = 1. / ard1
-    ard2i = 1. / ard2
+#     ard1i = 1. / ard1
+#     ard2i = 1. / ard2
 
-    # make_params = lambda ard: np.hstack([[1], ard, [1, .3]])
+#     k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard1i) + GPy.kern.bias(Q, 0) + GPy.kern.white(Q, 0.0001)
+#     Y1 = np.random.multivariate_normal(np.zeros(N), k.K(X), D1).T
+#     Y1 -= Y1.mean(0)
+#
+#     k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard2i) + GPy.kern.bias(Q, 0) + GPy.kern.white(Q, 0.0001)
+#     Y2 = np.random.multivariate_normal(np.zeros(N), k.K(X), D2).T
+#     Y2 -= Y2.mean(0)
+#     make_params = lambda ard: np.hstack([[1], ard, [1, .3]])
 
     D1, D2, N, M, Q = 50, 100, 150, 15, 4
-    X = np.random.randn(N, Q)
+    x = np.linspace(0, 2 * np.pi, N)[:, None]
 
-    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard1i) + GPy.kern.bias(Q, 0) + GPy.kern.white(Q, 0.0001)
-    Y1 = np.random.multivariate_normal(np.zeros(N), k.K(X), D1).T
-    Y1 -= Y1.mean(0)
+    s1 = np.vectorize(lambda x: np.sin(x))
+    s2 = np.vectorize(lambda x: np.cos(x))
+    sS = np.vectorize(lambda x: np.sin(2 * x))
 
-    k = GPy.kern.rbf(Q, ARD=True, lengthscale=ard2i) + GPy.kern.bias(Q, 0) + GPy.kern.white(Q, 0.0001)
-    Y2 = np.random.multivariate_normal(np.zeros(N), k.K(X), D2).T
-    Y2 -= Y2.mean(0)
+    S1 = np.hstack([s1(x), sS(x)])
+    S2 = np.hstack([s2(x), sS(x)])
 
-    k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q, 1.0)
+    Y1 = S1.dot(np.random.randn(S1.shape[1], D1))
+    Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
 
-    m = MRD(Y1, Y2, Q=Q, M=M, kernel=k, _debug=False)
+    k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
+
+    m = MRD(Y1, Y2, Q=Q, M=M, kernel=k, init="PCA", _debug=False)
     m.ensure_default_constraints()
 
-    fig = pyplot.figure("expected", figsize=(8, 3))
-    ax = fig.add_subplot(121)
-    ax.bar(np.arange(ard1.size) + .1, ard1)
-    ax = fig.add_subplot(122)
-    ax.bar(np.arange(ard2.size) + .1, ard2)
+#     fig = pyplot.figure("expected", figsize=(8, 3))
+#     ax = fig.add_subplot(121)
+#     ax.bar(np.arange(ard1.size) + .1, ard1)
+#     ax = fig.add_subplot(122)
+#     ax.bar(np.arange(ard2.size) + .1, ard2)
 
     return m
 
diff --git a/GPy/inference/SCG.py b/GPy/inference/SCG.py
index 0e85f243..912641f6 100644
--- a/GPy/inference/SCG.py
+++ b/GPy/inference/SCG.py
@@ -104,7 +104,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
 
         iteration += 1
         if display:
-            print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+            print 'Iteration: {0:<5g}  Objective:{1:< 12g}  Scale:{2:< 12g}\r'.format(iteration, fnow, beta),
+            # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
             sys.stdout.flush()
 
         if success:
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 6f9c97f6..447819c1 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -70,7 +70,7 @@ class kern(parameterised):
                 ax.bar(np.arange(len(ard_params)) - 0.4, ard_params)
                 ax.set_xticks(np.arange(len(ard_params)),
                               ["${}$".format(i + 1) for i in range(len(ard_params))])
-
+        return ax
 
     def _transform_gradients(self,g):
         x = self._get_params()
diff --git a/GPy/models/Bayesian_GPLVM.py b/GPy/models/Bayesian_GPLVM.py
index ee485e76..dd55b38f 100644
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@@ -22,7 +22,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, Q, X = None, X_variance = None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
+    def __init__(self, Y, Q, X=None, X_variance=None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
         if X == None:
             X = self.initialise_latent(init, Q, Y)
 
@@ -31,7 +31,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
 
         if Z is None:
             Z = np.random.permutation(X.copy())[:M]
-        assert Z.shape[1]==X.shape[1]
+        assert Z.shape[1] == X.shape[1]
 
         if kernel is None:
             kernel = kern.rbf(Q) + kern.white(Q)
@@ -40,8 +40,8 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
         sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_variance=X_variance, **kwargs)
 
     def _get_param_names(self):
-        X_names = sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
-        S_names = sum([['X_variance_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
+        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], [])
+        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], [])
         return (X_names + S_names + sparse_GP._get_param_names(self))
 
     def _get_params(self):
@@ -56,35 +56,43 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
         """
         return np.hstack((self.X.flatten(), self.X_variance.flatten(), sparse_GP._get_params(self)))
 
-    def _set_params(self,x):
+    def _set_params(self, x):
         N, Q = self.N, self.Q
-        self.X = x[:self.X.size].reshape(N,Q).copy()
-        self.X_variance = x[(N*Q):(2*N*Q)].reshape(N,Q).copy()
-        sparse_GP._set_params(self, x[(2*N*Q):])
+        self.X = x[:self.X.size].reshape(N, Q).copy()
+        self.X_variance = x[(N * Q):(2 * N * Q)].reshape(N, Q).copy()
+        sparse_GP._set_params(self, x[(2 * N * Q):])
+
+
+    def dKL_dmuS(self):
+        dKL_dS = (1. - (1. / self.X_variance)) * 0.5
+        dKL_dmu = self.X
+        return dKL_dmu, dKL_dS
 
     def dL_dmuS(self):
-        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_variance)
-        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi0_dmuS(self.dL_dpsi0,self.Z,self.X,self.X_variance)
-        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2,self.Z,self.X,self.X_variance)
+        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance)
+        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance)
+        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance)
         dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
         dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
 
-        dKL_dS = (1. - (1./self.X_variance))*0.5
-        dKL_dmu = self.X
-        return np.hstack(((dL_dmu - dKL_dmu).flatten(), (dL_dS - dKL_dS).flatten()))
+        return dL_dmu, dL_dS
 
     def KL_divergence(self):
         var_mean = np.square(self.X).sum()
         var_S = np.sum(self.X_variance - np.log(self.X_variance))
-        return 0.5*(var_mean + var_S) - 0.5*self.Q*self.N
+        return 0.5 * (var_mean + var_S) - 0.5 * self.Q * self.N
 
     def log_likelihood(self):
         return sparse_GP.log_likelihood(self) - self.KL_divergence()
 
     def _log_likelihood_gradients(self):
-        return np.hstack((self.dL_dmuS().flatten(), sparse_GP._log_likelihood_gradients(self)))
+        dKL_dmu, dKL_dS = self.dKL_dmuS()
+        dL_dmu, dL_dS = self.dL_dmuS()
+        # TODO: find way to make faster
+        dbound_dmuS = np.hstack(((dL_dmu - dKL_dmu).flatten(), (dL_dS - dKL_dS).flatten()))
+        return np.hstack((dbound_dmuS.flatten(), sparse_GP._log_likelihood_gradients(self)))
 
-    def plot_latent(self, which_indices=None,*args, **kwargs):
+    def plot_latent(self, which_indices=None, *args, **kwargs):
 
         if which_indices is None:
             try:
@@ -93,6 +101,6 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
                 raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
         else:
             input_1, input_2 = which_indices
-        ax = GPLVM.plot_latent(self, which_indices=[input_1, input_2],*args, **kwargs)
+        ax = GPLVM.plot_latent(self, which_indices=[input_1, input_2], *args, **kwargs)
         ax.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
         return ax
diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
index 2ce55dda..0c81a9c1 100644
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@@ -60,7 +60,7 @@ class GPLVM(GP):
         mu, var, upper, lower = self.predict(Xnew)
         pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
 
-    def plot_latent(self,labels=None, which_indices=None, resolution=50):
+    def plot_latent(self, labels=None, which_indices=None, resolution=50, ax=pb.gca()):
         """
         :param labels: a np.array of size self.N containing labels for the points (can be number, strings, etc)
         :param resolution: the resolution of the grid on which to evaluate the predictive variance
@@ -90,7 +90,7 @@ class GPLVM(GP):
         Xtest_full[:, :2] = Xtest
         mu, var, low, up = self.predict(Xtest_full)
         var = var[:, :1] 
-        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],
+        ax.imshow(var.reshape(resolution, resolution).T[::-1, :],
                   extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear')
 
         for i,ul in enumerate(np.unique(labels)):
@@ -110,15 +110,15 @@ class GPLVM(GP):
                 y = self.X[index,input_2]
             pb.plot(x,y,marker='o',color=util.plot.Tango.nextMedium(),mew=0,label=this_label,linewidth=0)
 
-        pb.xlabel('latent dimension %i'%input_1)
-        pb.ylabel('latent dimension %i'%input_2)
+        ax.set_xlabel('latent dimension %i' % input_1)
+        ax.set_ylabel('latent dimension %i' % input_2)
 
         if not np.all(labels==1.):
-            pb.legend(loc=0,numpoints=1)
+            ax.legend(loc=0, numpoints=1)
 
-        pb.xlim(xmin[0],xmax[0])
-        pb.ylim(xmin[1],xmax[1])
-        pb.grid(b=False) # remove the grid if present, it doesn't look good
-        ax = pb.gca()
+        ax.set_xlim(xmin[0], xmax[0])
+        ax.set_ylim(xmin[1], xmax[1])
+        ax.grid(b=False)  # remove the grid if present, it doesn't look good
+        # ax = pb.gca()
         ax.set_aspect('auto') # set a nice aspect ratio
         return ax
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 842ef42f..bd1c3528 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -121,23 +121,60 @@ class MRD(model):
 
 
     def log_likelihood(self):
-        ll = self.gref.KL_divergence()
+        ll = +self.gref.KL_divergence()
         for g in self.bgplvms:
-            ll += sparse_GP.log_likelihood(g)
-        return ll
+            ll -= sparse_GP.log_likelihood(g)
+        return -ll
 
     def _log_likelihood_gradients(self):
-        dldmus = self.gref.dL_dmuS().flatten()
-        dldzt1 = sparse_GP._log_likelihood_gradients(self.gref)
-        return numpy.hstack((dldmus,
+        dLdmu, dLdS = reduce(lambda a, b: [a[0] + b[0], a[1] + b[1]], (g.dL_dmuS() for g in self.bgplvms))
+        dKLmu, dKLdS = self.gref.dKL_dmuS()
+        dLdmu -= dKLmu
+        dLdS -= dKLdS
+        dLdmuS = numpy.hstack((dLdmu.flatten(), dLdS.flatten())).flatten()
+        dldzt1 = reduce(lambda a, b: a + b, (sparse_GP._log_likelihood_gradients(g)[:self.MQ] for g in self.bgplvms))
+
+        return numpy.hstack((dLdmuS,
                              dldzt1,
                 numpy.hstack([numpy.hstack([g.dL_dtheta(),
                                             g.likelihood._gradients(\
                                                 partial=g.partial_for_likelihood)]) \
-                              for g in self.bgplvms[1:]])))
+                              for g in self.bgplvms])))
 
-    def plot_scales(self):
+    def plot_X(self):
+        fig = pylab.figure("MRD X", figsize=(4 * len(self.bgplvms), 3))
+        fig.clf()
+        for i, g in enumerate(self.bgplvms):
+            ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
+            ax.imshow(g.X)
+        pylab.draw()
+        fig.tight_layout()
+        return fig
+
+    def plot_predict(self):
+        fig = pylab.figure("MRD Predictions", figsize=(4 * len(self.bgplvms), 3))
+        fig.clf()
+        for i, g in enumerate(self.bgplvms):
+            ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
+            ax.imshow(g.predict(g.X)[0])
+        pylab.draw()
+        fig.tight_layout()
+        return fig
+
+    def plot_scales(self, *args, **kwargs):
         fig = pylab.figure("MRD Scales", figsize=(4 * len(self.bgplvms), 3))
         for i, g in enumerate(self.bgplvms):
             ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
-            g.kern.plot_ARD(ax=ax)
+            g.kern.plot_ARD(ax=ax, *args, **kwargs)
+        pylab.draw()
+        fig.tight_layout()
+        return fig
+
+    def plot_latent(self, *args, **kwargs):
+        fig = pylab.figure("MRD Latent Spaces", figsize=(4 * len(self.bgplvms), 3))
+        for i, g in enumerate(self.bgplvms):
+            ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
+            g.plot_latent(ax=ax, *args, **kwargs)
+        pylab.draw()
+        fig.tight_layout()
+        return fig
diff --git a/GPy/testing/mrd_tests.py b/GPy/testing/mrd_tests.py
index e25a1040..d90a7430 100644
--- a/GPy/testing/mrd_tests.py
+++ b/GPy/testing/mrd_tests.py
@@ -10,23 +10,23 @@ import unittest
 import numpy as np
 import GPy
 
-# class MRDTests(unittest.TestCase):
-#
-#     # @unittest.skip('')
-#     def test_gradients(self):
-#         num_m = 2
-#         N, M, Q, D = 10, 3, 2, 4
-#         X = np.random.rand(N, Q)
-#         k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
-#         K = k.K(X)
-#         Ylist = [np.random.multivariate_normal(np.zeros(N), K, D).T for _ in range(num_m)]
-#
-#         m = GPy.models.MRD(*Ylist, Q=Q, kernel=k, M=M)
-#         m._debug = True
-#         m.ensure_default_constraints()
-#         m.randomize()
-#         self.assertTrue(m.checkgrad())
-#
-# if __name__ == "__main__":
-#     print "Running unit tests, please be (very) patient..."
-#     # unittest.main()
+class MRDTests(unittest.TestCase):
+
+    def test_gradients(self):
+        num_m = 3
+        N, M, Q, D = 20, 8, 5, 50
+        X = np.random.rand(N, Q)
+
+        k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q)
+        K = k.K(X)
+        Ylist = [np.random.multivariate_normal(np.zeros(N), K, D).T for _ in range(num_m)]
+
+        m = GPy.models.MRD(*Ylist, Q=Q, kernel=k, M=M)
+        m.ensure_default_constraints()
+        m.randomize()
+
+        self.assertTrue(m.checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()

From 338f3295b15025e4abd9332042e53f80e9600c06 Mon Sep 17 00:00:00 2001
From: Nicolo Fusi <nicolo.fusi@gmail.com>
Date: Fri, 12 Apr 2013 13:31:15 +0100
Subject: [PATCH 07/11] now returning the ax for plot_latent in BGPLVM

---
 GPy/models/Bayesian_GPLVM.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/models/Bayesian_GPLVM.py b/GPy/models/Bayesian_GPLVM.py
index ba9603bb..aaaefa7f 100644
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@@ -95,3 +95,5 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
             input_1, input_2 = which_indices
         ax = GPLVM.plot_latent(self, which_indices=[input_1, input_2],*args, **kwargs)
         ax.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
+
+        return ax

From c7e8345c9614300fbfe20102026864df0756af5b Mon Sep 17 00:00:00 2001
From: Nicolo Fusi <nicolo.fusi@gmail.com>
Date: Fri, 12 Apr 2013 13:31:45 +0100
Subject: [PATCH 08/11] --march=native was causing problems on the stupid
 compiler on MacOS

---
 GPy/kern/rbf.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py
index a26bb79c..ff5d6ff3 100644
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@@ -228,9 +228,8 @@ class rbf(kernpart):
 
     def weave_psi2(self,mu,Zhat):
         weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -march=native'],
-                         'extra_link_args'   : ['-lgomp'],
-                         'compiler'          : 'gcc'}
+                         'extra_compile_args': ['-fopenmp -O3'],  #-march=native'],
+                         'extra_link_args'   : ['-lgomp']}
 
         N,Q = mu.shape
         M = Zhat.shape[0]

From ffa1879cfc7cb93b7921e33ce11029bd5efa7f8c Mon Sep 17 00:00:00 2001
From: Nicolo Fusi <nicolo.fusi@gmail.com>
Date: Fri, 12 Apr 2013 13:32:18 +0100
Subject: [PATCH 09/11] added automatic scale_factor to sparse GPs

---
 GPy/models/sparse_GP.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index 88abf77d..cebcba0b 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -36,7 +36,7 @@ class sparse_GP(GP):
 
     def __init__(self, X, likelihood, kernel, Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False):
         self.scale_factor = 100.0# a scaling factor to help keep the algorithm stable
-
+        self.auto_scale_factor = False
         self.Z = Z
         self.Zslices = Zslices
         self.Xslices = Xslices
@@ -184,6 +184,8 @@ class sparse_GP(GP):
         self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
         self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
         self._compute_kernel_matrices()
+        if self.auto_scale_factor:
+            self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
         self._computations()
 
     def _get_params(self):

From c8d64a4a69bd193833ac61c3fad721b6cdb3eecc Mon Sep 17 00:00:00 2001
From: Nicolo Fusi <nicolo.fusi@gmail.com>
Date: Fri, 12 Apr 2013 13:32:27 +0100
Subject: [PATCH 10/11] minor changes

---
 GPy/examples/dimensionality_reduction.py |  2 +-
 GPy/inference/SGD.py                     | 53 ++++++++----------------
 2 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 61a4abd8..b8c60a09 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -120,7 +120,7 @@ def brendan_faces():
 def stick():
     data = GPy.util.datasets.stick()
     m = GPy.models.GPLVM(data['Y'], 2)
-    
+
     # optimize
     m.ensure_default_constraints()
     m.optimize(messages=1, max_f_eval=10000)
diff --git a/GPy/inference/SGD.py b/GPy/inference/SGD.py
index 13a325b0..5d1b673d 100644
--- a/GPy/inference/SGD.py
+++ b/GPy/inference/SGD.py
@@ -75,7 +75,10 @@ class opt_SGD(Optimizer):
         return (np.isnan(data).sum(axis=1) == 0)
 
     def check_for_missing(self, data):
-        return np.isnan(data).sum() > 0
+        if sp.sparse.issparse(self.model.likelihood.Y):
+            return True
+        else:
+            return np.isnan(data).sum() > 0
 
     def subset_parameter_vector(self, x, samples, param_shapes):
         subset = np.array([], dtype = int)
@@ -149,10 +152,10 @@ class opt_SGD(Optimizer):
         else:
             raise NotImplementedError
 
-    def step_with_missing_data(self, f_fp, X, step, shapes, sparse_matrix):
+    def step_with_missing_data(self, f_fp, X, step, shapes):
         N, Q = X.shape
 
-        if not sparse_matrix:
+        if not sp.sparse.issparse(self.model.likelihood.Y):
             Y = self.model.likelihood.Y
             samples = self.non_null_samples(self.model.likelihood.Y)
             self.model.N = samples.sum()
@@ -165,7 +168,6 @@ class opt_SGD(Optimizer):
         if self.model.N == 0 or Y.std() == 0.0:
             return 0, step, self.model.N
 
-        # FIXME: get rid of self.center, everything should be centered by default
         self.model.likelihood._mean = Y.mean()
         self.model.likelihood._std = Y.std()
         self.model.likelihood.set_data(Y)
@@ -173,10 +175,6 @@ class opt_SGD(Optimizer):
         j = self.subset_parameter_vector(self.x_opt, samples, shapes)
         self.model.X = X[samples]
 
-        # if self.center:
-        #     self.model.likelihood.Y -= self.model.likelihood.Y.mean()
-        #     self.model.likelihood.Y /= self.model.likelihood.Y.std()
-
         model_name = self.model.__class__.__name__
 
         if model_name == 'Bayesian_GPLVM':
@@ -185,33 +183,31 @@ class opt_SGD(Optimizer):
 
         b, p = self.shift_constraints(j)
         f, fp = f_fp(self.x_opt[j])
-        # momentum_term = self.momentum * step[j]
-        # step[j] = self.learning_rate[j] * fp
-        # self.x_opt[j] -= step[j] + momentum_term
-
         step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
         self.x_opt[j] -= step[j]
 
         self.restore_constraints(b, p)
+        # restore likelihood _mean and _std, otherwise when we call set_data(y) on
+        # the next feature, it will get normalized with the mean and std of this one.
+        self.model.likelihood._mean = 0
+        self.model.likelihood._std = 1
 
         return f, step, self.model.N
 
     def opt(self, f_fp=None, f=None, fp=None):
         self.x_opt = self.model._get_params_transformed()
         X, Y = self.model.X.copy(), self.model.likelihood.Y.copy()
-        N, Q = self.model.X.shape
-        D = self.model.likelihood.Y.shape[1]
-        self.trace = []
-        sparse_matrix = sp.sparse.issparse(self.model.likelihood.Y)
-        missing_data = True
-        if not sparse_matrix:
-            missing_data = self.check_for_missing(self.model.likelihood.Y)
 
         self.model.likelihood.YYT = None
         self.model.likelihood.trYYT = None
         self.model.likelihood._mean = 0.0
         self.model.likelihood._std = 1.0
+
+        N, Q = self.model.X.shape
+        D = self.model.likelihood.Y.shape[1]
         num_params = self.model._get_params()
+        self.trace = []
+        missing_data = self.check_for_missing(self.model.likelihood.Y)
 
         step = np.zeros_like(num_params)
         for it in range(self.iterations):
@@ -224,34 +220,26 @@ class opt_SGD(Optimizer):
             b = len(features)/self.batch_size
             features = [features[i::b] for i in range(b)]
             NLL = []
-            count = 0
-            last_printed_count = -1
 
-            for j in features:
-                count += 1
+            for count, j in enumerate(features):
                 self.model.D = len(j)
                 self.model.likelihood.D = len(j)
                 self.model.likelihood.set_data(Y[:, j])
 
-                if missing_data or sparse_matrix:
+                if missing_data:
                     shapes = self.get_param_shapes(N, Q)
-                    f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes, sparse_matrix)
+                    f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
                 else:
                     Nj = N
                     f, fp = f_fp(self.x_opt)
-                    # momentum_term = self.momentum * step # compute momentum using update(t-1)
-                    # step = self.learning_rate * fp # compute update(t)
-                    # self.x_opt -= step + momentum_term
                     step = self.momentum * step + self.learning_rate * fp
                     self.x_opt -= step
 
-
                 if self.messages == 2:
                     noise = self.model.likelihood._variance
                     status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
                     sys.stdout.write(status)
                     sys.stdout.flush()
-                    last_printed_count = count
                     self.param_traces['noise'].append(noise)
                 NLL.append(f)
 
@@ -269,7 +257,6 @@ class opt_SGD(Optimizer):
             self.model.likelihood.D = D
             self.model.likelihood.Y = Y
 
-            # self.model.Youter = np.dot(Y, Y.T)
             self.trace.append(self.f_opt)
             if self.iteration_file is not None:
                 f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
@@ -282,7 +269,3 @@ class opt_SGD(Optimizer):
                 status = "SGD Iteration: {0: 3d}/{1: 3d}  f: {2: 2.3f}\n".format(it+1, self.iterations, self.f_opt)
                 sys.stdout.write(status)
                 sys.stdout.flush()
-
-
-
-

From f6b98160a7c0ace6ca5f795aeb878d30b8aaf6a4 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Fri, 12 Apr 2013 18:45:14 +0100
Subject: [PATCH 11/11] auto_scale

---
 GPy/models/sparse_GP.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index cebcba0b..b816e684 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -185,7 +185,11 @@ class sparse_GP(GP):
         self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
         self._compute_kernel_matrices()
         if self.auto_scale_factor:
-            self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
+            if self.likelihood.is_heteroscedastic:
+                self.scale_factor = max(100.,(self.psi2_beta_scaled.sum(0).max()))
+                print self.scale_factor
+            else:
+                self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
         self._computations()
 
     def _get_params(self):