From 2fb86f9b518f2398ff500569a5819adb8e5bcffa Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Wed, 13 Aug 2014 14:10:44 +0100
Subject: [PATCH 01/14] fix the bug of caching w.r.t. ignore arguments

---
 GPy/util/caching.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index bce51067..e15fca9c 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -33,13 +33,15 @@ class Cacher(object):
         """returns the self.id of an object, to be used in caching individual self.ids"""
         return hex(id(obj))
 
-    def combine_inputs(self, args, kw):
+    def combine_inputs(self, args, kw, ignore_args):
         "Combines the args and kw in a unique way, such that ordering of kwargs does not lead to recompute"
-        return args + tuple(c[1] for c in sorted(kw.items(), key=lambda x: x[0]))
+        inputs= args + tuple(c[1] for c in sorted(kw.items(), key=lambda x: x[0]))
+        # REMOVE the ignored arguments from input and PREVENT it from being checked!!!
+        return [a for i,a in enumerate(inputs) if i not in ignore_args]
 
-    def prepare_cache_id(self, combined_args_kw, ignore_args):
-        "get the cacheid (conc. string of argument self.ids in order) ignoring ignore_args"
-        cache_id = "".join(self.id(a) for i, a in enumerate(combined_args_kw) if i not in ignore_args)
+    def prepare_cache_id(self, combined_args_kw):
+        "get the cacheid (conc. string of argument self.ids in order)"
+        cache_id = "".join(self.id(a) for a in combined_args_kw)
         return cache_id
 
     def ensure_cache_length(self, cache_id):
@@ -95,10 +97,12 @@ class Cacher(object):
                     return self.operation(*args, **kw)
 
         # 2: prepare_cache_id and get the unique self.id string for this call
-        inputs = self.combine_inputs(args, kw)
-        cache_id = self.prepare_cache_id(inputs, self.ignore_args)
+        inputs = self.combine_inputs(args, kw, self.ignore_args)
+        cache_id = self.prepare_cache_id(inputs)
         # 2: if anything is not cachable, we will just return the operation, without caching
         if reduce(lambda a, b: a or (not (isinstance(b, Observable) or b is None)), inputs, False):
+            #print 'WARNING: '+self.operation.__name__ + ' not cacheable!'
+            #print [not (isinstance(b, Observable)) for b in inputs]
             return self.operation(*args, **kw)
         # 3&4: check whether this cache_id has been cached, then has it changed?
         try:

From 5f863e8ca29d8b94c79aeb126c85dbf641722720 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Fri, 15 Aug 2014 08:39:14 -0700
Subject: [PATCH 02/14] [parameterization] Parameter adding more robust and
 better error handling

---
 GPy/core/parameterization/parameter_core.py | 17 ++++++++++++++---
 GPy/core/parameterization/parameterized.py  |  5 ++++-
 GPy/core/parameterization/variational.py    |  2 +-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 0ecc1ebf..4edb0520 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -954,19 +954,30 @@ class Parameterizable(OptimizationHandlable):
         if ignore_added_names:
             self.__dict__[pname] = param
             return
+
+        def warn_and_retry():
+            print """
+            WARNING: added a parameter with formatted name {},
+            which is already assigned to {}.
+            Trying to change the parameter name to
+
+            {}.{}
+            """.format(pname, self.hierarchy_name(), self.hierarchy_name(), param.name + "_")
+            param.name += "_"
+            self._add_parameter_name(param, ignore_added_names)
         # and makes sure to not delete programmatically added parameters
         if pname in self.__dict__:
             if not (param is self.__dict__[pname]):
                 if pname in self._added_names_:
                     del self.__dict__[pname]
                     self._add_parameter_name(param)
+                else:
+                    warn_and_retry()
         elif pname not in dir(self):
             self.__dict__[pname] = param
             self._added_names_.add(pname)
         else:
-            print "WARNING: added a parameter with formatted name {}, which is already a member of {} object. Trying to change the parameter name to\n   {}".format(pname, self.__class__, param.name + "_")
-            param.name += "_"
-            self._add_parameter_name(param, ignore_added_names)
+            warn_and_retry()
 
     def _remove_parameter_name(self, param=None, pname=None):
         assert param is None or pname is None, "can only delete either param by name, or the name of a param"
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index eabd5a9c..e036d680 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -180,7 +180,10 @@ class Parameterized(Parameterizable):
         :param param: param object to remove from being a parameter of this parameterized object.
         """
         if not param in self.parameters:
-            raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
+            try:
+                raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
+            except AttributeError:
+                raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))
 
         start = sum([p.size for p in self.parameters[:param._parent_index_]])
         self._remove_parameter_name(param)
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index cf8d3067..9e0127a2 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -111,7 +111,7 @@ class VariationalPosterior(Parameterized):
             n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
             return n
         else:
-            return super(VariationalPrior, self).__getitem__(s)
+            return super(VariationalPosterior, self).__getitem__(s)
 
 class NormalPosterior(VariationalPosterior):
     '''

From 0b3f8b3cc74b2b4dfa03dad30fd33d4b80c9a7c3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 15 Aug 2014 18:22:09 +0100
Subject: [PATCH 03/14] Added kronecker and variational gaussian approximation
 gp's, vargpapprox needs generalising to any factorizing likelihood

---
 GPy/models/__init__.py                        |   2 +
 .../gp_kronecker_gaussian_regression.py       | 119 ++++++++++++++++++
 GPy/models/gp_var_gauss.py                    | 108 ++++++++++++++++
 GPy/testing/model_tests.py                    |  39 ++++++
 4 files changed, 268 insertions(+)
 create mode 100644 GPy/models/gp_kronecker_gaussian_regression.py
 create mode 100644 GPy/models/gp_var_gauss.py

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 1fb781d5..cce26783 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -18,3 +18,5 @@ from gp_coregionalized_regression import GPCoregionalizedRegression
 from sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from gp_heteroscedastic_regression import GPHeteroscedasticRegression
 from ss_mrd import SSMRD
+from gp_kronecker_gaussian_regression import GPKroneckerGaussianRegression
+from gp_var_gauss import GPVariationalGaussianApproximation
diff --git a/GPy/models/gp_kronecker_gaussian_regression.py b/GPy/models/gp_kronecker_gaussian_regression.py
new file mode 100644
index 00000000..0e8dab81
--- /dev/null
+++ b/GPy/models/gp_kronecker_gaussian_regression.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2014, James Hensman, Alan Saul
+# Distributed under the terms of the GNU General public License, see LICENSE.txt
+
+import numpy as np
+from ..core.model import Model
+from ..core.parameterization import ObsAr
+from .. import likelihoods
+
+class GPKroneckerGaussianRegression(Model):
+    """
+    Kronecker GP regression
+
+    Take two kernels computed on separate spaces K1(X1), K2(X2), and a data
+    matrix Y which is f size (N1, N2).
+
+    The effective covaraince is np.kron(K2, K1)
+    The effective data is vec(Y) = Y.flatten(order='F')
+
+    The noise must be iid Gaussian.
+
+    See Stegle et al.
+    @inproceedings{stegle2011efficient,
+      title={Efficient inference in matrix-variate gaussian models with $\\backslash$ iid observation noise},
+      author={Stegle, Oliver and Lippert, Christoph and Mooij, Joris M and Lawrence, Neil D and Borgwardt, Karsten M},
+      booktitle={Advances in Neural Information Processing Systems},
+      pages={630--638},
+      year={2011}
+    }
+
+    """
+    def __init__(self, X1, X2, Y, kern1, kern2, noise_var=1., name='KGPR'):
+        Model.__init__(self, name=name)
+        # accept the construction arguments
+        self.X1 = ObsAr(X1)
+        self.X2 = ObsAr(X2)
+        self.Y = Y
+        self.kern1, self.kern2 = kern1, kern2
+        self.add_parameter(self.kern1)
+        self.add_parameter(self.kern2)
+
+        self.likelihood = likelihoods.Gaussian()
+        self.likelihood.variance = noise_var
+        self.add_parameter(self.likelihood)
+
+        self.num_data1, self.input_dim1 = self.X1.shape
+        self.num_data2, self.input_dim2 = self.X2.shape
+
+        assert kern1.input_dim == self.input_dim1
+        assert kern2.input_dim == self.input_dim2
+        assert Y.shape == (self.num_data1, self.num_data2)
+
+    def log_likelihood(self):
+        return self._log_marginal_likelihood
+
+    def parameters_changed(self):
+        (N1, D1), (N2, D2) = self.X1.shape, self.X2.shape
+        K1, K2 = self.kern1.K(self.X1), self.kern2.K(self.X2)
+
+        # eigendecompositon
+        S1, U1 = np.linalg.eigh(K1)
+        S2, U2 = np.linalg.eigh(K2)
+        W = np.kron(S2, S1) + self.likelihood.variance
+
+        Y_ = U1.T.dot(self.Y).dot(U2)
+
+        # store these quantities: needed for prediction
+        Wi = 1./W
+        Ytilde = Y_.flatten(order='F')*Wi
+
+        self._log_marginal_likelihood = -0.5*self.num_data1*self.num_data2*np.log(2*np.pi)\
+                                        -0.5*np.sum(np.log(W))\
+                                        -0.5*np.dot(Y_.flatten(order='F'), Ytilde)
+
+        # gradients for data fit part
+        Yt_reshaped = Ytilde.reshape(N1, N2, order='F')
+        tmp = U1.dot(Yt_reshaped)
+        dL_dK1 = .5*(tmp*S2).dot(tmp.T)
+        tmp = U2.dot(Yt_reshaped.T)
+        dL_dK2 = .5*(tmp*S1).dot(tmp.T)
+
+        # gradients for logdet
+        Wi_reshaped = Wi.reshape(N1, N2, order='F')
+        tmp = np.dot(Wi_reshaped, S2)
+        dL_dK1 += -0.5*(U1*tmp).dot(U1.T)
+        tmp = np.dot(Wi_reshaped.T, S1)
+        dL_dK2 += -0.5*(U2*tmp).dot(U2.T)
+
+        self.kern1.update_gradients_full(dL_dK1, self.X1)
+        self.kern2.update_gradients_full(dL_dK2, self.X2)
+
+        # gradients for noise variance
+        dL_dsigma2 = -0.5*Wi.sum() + 0.5*np.sum(np.square(Ytilde))
+        self.likelihood.variance.gradient = dL_dsigma2
+
+        # store these quantities for prediction:
+        self.Wi, self.Ytilde, self.U1, self.U2 = Wi, Ytilde, U1, U2
+
+    def predict(self, X1new, X2new):
+        """
+        Return the predictive mean and variance at a series of new points X1new, X2new
+        Only returns the diagonal of the predictive variance, for now.
+
+        :param X1new: The points at which to make a prediction
+        :type X1new: np.ndarray, Nnew x self.input_dim1
+        :param X2new: The points at which to make a prediction
+        :type X2new: np.ndarray, Nnew x self.input_dim2
+
+        """
+        k1xf = self.kern1.K(X1new, self.X1)
+        k2xf = self.kern2.K(X2new, self.X2)
+        A = k1xf.dot(self.U1)
+        B = k2xf.dot(self.U2)
+        mu = A.dot(self.Ytilde.reshape(self.num_data1, self.num_data2, order='F')).dot(B.T).flatten(order='F')
+        k1xx = self.kern1.Kdiag(X1new)
+        k2xx = self.kern2.Kdiag(X2new)
+        BA = np.kron(B, A)
+        var = np.kron(k2xx, k1xx) - np.sum(BA**2*self.Wi, 1) + self.likelihood.variance
+
+        return mu[:, None], var[:, None]
diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py
new file mode 100644
index 00000000..68b62443
--- /dev/null
+++ b/GPy/models/gp_var_gauss.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2014, James Hensman, Alan Saul
+# Distributed under the terms of the GNU General public License, see LICENSE.txt
+
+import numpy as np
+from scipy import stats
+from scipy.special import erf
+from ..core.model import Model
+from ..core.parameterization import ObsAr
+from .. import kern
+from ..core.parameterization.param import Param
+from ..util.linalg import pdinv
+
+log_2_pi = np.log(2*np.pi)
+
+
+class GPVariationalGaussianApproximation(Model):
+    """
+    The Variational Gaussian Approximation revisited implementation for regression
+
+    @article{Opper:2009,
+        title = {The Variational Gaussian Approximation Revisited},
+        author = {Opper, Manfred and Archambeau, C{\'e}dric},
+        journal = {Neural Comput.},
+        year = {2009},
+        pages = {786--792},
+    }
+    """
+    def __init__(self, X, Y, kernel=None):
+        Model.__init__(self,'Variational GP classification')
+        # accept the construction arguments
+        self.X = ObsAr(X)
+        if kernel is None:
+            kernel = kern.RBF(X.shape[1]) + kern.White(X.shape[1], 0.01)
+        self.kern = kernel
+        self.add_parameter(self.kern)
+        self.num_data, self.input_dim = self.X.shape
+
+        self.alpha = Param('alpha', np.zeros(self.num_data))
+        self.beta = Param('beta', np.ones(self.num_data))
+        self.add_parameter(self.alpha)
+        self.add_parameter(self.beta)
+
+        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
+        self.Ysign = np.where(Y==1, 1, -1).flatten()
+
+    def log_likelihood(self):
+        """
+        Marginal log likelihood evaluation
+        """
+        return self._log_lik
+
+    def likelihood_quadrature(self, m, v):
+        """
+        Perform Gauss-Hermite quadrature over the log of the likelihood, with a fixed weight
+        """
+        # assume probit for now.
+        X = self.gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*self.Ysign)[:, None]
+        p = stats.norm.cdf(X)
+        N = stats.norm.pdf(X)
+        F = np.log(p).dot(self.gh_w)
+        NoverP = N/p
+        dF_dm = (NoverP*self.Ysign[:,None]).dot(self.gh_w)
+        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
+        return F, dF_dm, dF_dv
+
+    def parameters_changed(self):
+        K = self.kern.K(self.X)
+        m = K.dot(self.alpha)
+        KB = K*self.beta[:, None]
+        BKB = KB*self.beta[None, :]
+        A = np.eye(self.num_data) + BKB
+        Ai, LA, _, Alogdet = pdinv(A)
+        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
+        var = np.diag(Sigma)
+
+        F, dF_dm, dF_dv = self.likelihood_quadrature(m, var)
+        dF_da = np.dot(K, dF_dm)
+        SigmaB = Sigma*self.beta
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv)).dot(SigmaB))*2
+        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + m.dot(self.alpha))
+        dKL_da = m
+        A_A2 = Ai - Ai.dot(Ai)
+        dKL_db = np.diag(np.dot(KB.T, A_A2))
+        self._log_lik = F.sum() - KL
+        self.alpha.gradient = dF_da - dKL_da
+        self.beta.gradient = dF_db - dKL_db
+
+        # K-gradients
+        dKL_dK = 0.5*(self.alpha[None, :]*self.alpha[:, None] + self.beta[:, None]*self.beta[None, :]*A_A2)
+        tmp = Ai*self.beta[:, None]/self.beta[None, :]
+        dF_dK = self.alpha[:, None]*dF_dm[None, :] + np.dot(tmp*dF_dv, tmp.T)
+        self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
+
+    def predict(self, Xnew):
+        """
+        Predict the function(s) at the new point(s) Xnew.
+
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        """
+        Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
+        Kux = self.kern.K(self.X, Xnew)
+        mu = np.dot(Kux.T, self.alpha)
+        WiKux = np.dot(Wi, Kux)
+        Kxx = self.kern.Kdiag(Xnew)
+        var = Kxx - np.sum(WiKux*Kux, 0)
+
+        return 0.5*(1+erf(mu/np.sqrt(2.*(var+1))))
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 7b19538b..451ab757 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -423,6 +423,45 @@ class GradientTests(np.testing.TestCase):
         m = GPy.models.GPHeteroscedasticRegression(X,Y,kern)
         self.assertTrue(m.checkgrad())
 
+    def test_gp_kronecker_gaussian(self):
+        N1, N2 = 30, 20
+        X1 = np.random.randn(N1, 1)
+        X2 = np.random.randn(N2, 1)
+        X1.sort(0); X2.sort(0)
+        k1 = GPy.kern.RBF(1)  # + GPy.kern.White(1)
+        k2 = GPy.kern.RBF(1)  # + GPy.kern.White(1)
+        Y = np.random.randn(N1, N2)
+        m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y, k1, k2)
+
+        # build the model the dumb way
+        assert (N1*N2<1000), "too much data for standard GPs!"
+        yy, xx = np.meshgrid(X2, X1)
+        Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
+        kg = GPy.kern.RBF(1, active_dims=[0]) * GPy.kern.RBF(1, active_dims=[1])
+        mm = GPy.models.GPRegression(Xgrid, Y.reshape(-1, 1, order='F'), kernel=kg)
+
+        m.randomize()
+        mm[:] = m[:]
+        assert np.allclose(m.log_likelihood(), mm.log_likelihood())
+        assert np.allclose(m.gradient, mm.gradient)
+        X1test = np.random.randn(100, 1)
+        X2test = np.random.randn(100, 1)
+        mean1, var1 = m.predict(X1test, X2test)
+        yy, xx = np.meshgrid(X2test, X1test)
+        Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
+        mean2, var2 = mm.predict(Xgrid)
+        assert np.allclose(mean1, mean2)
+        assert np.allclose(var1, var2)
+
+    def test_gp_VGPC(self):
+        num_obs = 25
+        X = np.random.randint(0,140,num_obs)
+        X = X[:,None]
+        Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
+        kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
+        m = GPy.models.GPVariationalGaussianApproximation(X,Y,kern)
+        self.assertTrue(m.checkgrad())
+
 
 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."

From 6b8ac70210a3c6ebccfcddf99068a12669d7bfd4 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 18 Aug 2014 15:17:52 +0100
Subject: [PATCH 04/14] minor changes on SSGPLVM

---
 GPy/models/ss_gplvm.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index d1b07415..027f9783 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -24,7 +24,7 @@ class SSGPLVM(SparseGP):
 
     """
     def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, pi=None, learnPi=True, **kwargs):
 
         self.mpi_comm = mpi_comm
         self.__IN_OPTIMIZATION__ = False
@@ -49,9 +49,6 @@ class SSGPLVM(SparseGP):
         if Z is None:
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]
-
-        pi = np.empty((input_dim))
-        pi[:] = 0.5
         
         if likelihood is None:
             likelihood = Gaussian()
@@ -64,7 +61,10 @@ class SSGPLVM(SparseGP):
         if inference_method is None:
             inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
 
-        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=True) # the prior probability of the latent binary variable b
+        if pi is None:
+            pi = np.empty((input_dim))
+            pi[:] = 0.5
+        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi) # the prior probability of the latent binary variable b
         
         X = SpikeAndSlabPosterior(X, X_variance, gamma)
         

From 3d322301a2659a43e87c22a333d9f6f320ab939a Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 18 Aug 2014 16:44:15 +0100
Subject: [PATCH 05/14] linear kernel psi statistics performance optimization

---
 GPy/kern/_src/psi_comp/linear_psi_comp.py   | 11 ++++-------
 GPy/kern/_src/psi_comp/sslinear_psi_comp.py | 18 +++++++-----------
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py
index 4f064a59..94cba0f5 100644
--- a/GPy/kern/_src/psi_comp/linear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py
@@ -21,9 +21,7 @@ def psicomputations(variance, Z, variational_posterior):
 
     psi0 = np.einsum('q,nq->n',variance,np.square(mu)+S)
     psi1 = np.einsum('q,mq,nq->nm',variance,Z,mu)
-
-    tmp = np.einsum('q,mq,nq->nm',variance,Z,mu)
-    psi2 = np.einsum('q,mq,oq,nq->mo',np.square(variance),Z,Z,S) + np.einsum('nm,no->mo',tmp,tmp)
+    psi2 = np.einsum('q,mq,oq,nq->mo',np.square(variance),Z,Z,S) + np.einsum('nm,no->mo',psi1,psi1)
 
     return psi0, psi1, psi2
 
@@ -58,13 +56,12 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
     
     variance2 = np.square(variance)
     common_sum = np.einsum('q,mq,nq->nm',variance,Z,mu) # NxM
+    dL_dpsi2_2 = dL_dpsi2+dL_dpsi2.T
 
     dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*S,variance,Z,Z)+\
-        np.einsum('mo,mq,nq,no->q',dL_dpsi2,Z,mu,common_sum)+\
-        np.einsum('mo,oq,nq,nm->q',dL_dpsi2,Z,mu,common_sum)
+        np.einsum('mo,mq,nq,no->q',dL_dpsi2_2,Z,mu,common_sum)
             
-    dL_dmu = np.einsum('mo,q,mq,no->nq',dL_dpsi2,variance,Z,common_sum)+\
-        np.einsum('mo,q,oq,nm->nq',dL_dpsi2,variance,Z,common_sum)
+    dL_dmu = np.einsum('mo,q,mq,no->nq',dL_dpsi2_2,variance,Z,common_sum)
     
     dL_dS = np.empty(S.shape)
     dL_dS[:] = np.einsum('mo,q,mq,oq->q',dL_dpsi2,variance2,Z,Z)
diff --git a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
index ec81c40f..dc864b3c 100644
--- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
@@ -22,11 +22,8 @@ def psicomputations(variance, Z, variational_posterior):
 
     psi0 = np.einsum('q,nq,nq->n',variance,gamma,np.square(mu)+S)
     psi1 = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu)
-    mu2 = np.square(mu)
-    variances2 = np.square(variance)
-    tmp = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu)
-    psi2 = np.einsum('nq,q,mq,oq,nq->mo',gamma,variances2,Z,Z,mu2+S)+\
-           np.einsum('nm,no->mo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->mo',np.square(gamma),variances2,Z,Z,mu2)
+    psi2 = np.einsum('nq,q,mq,oq,nq->mo',gamma,np.square(variance),Z,Z,(1-gamma)*np.square(mu)+S) +\
+                np.einsum('nm,no->mo',psi1,psi1) 
 
     return psi0, psi1, psi2
 
@@ -67,18 +64,17 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
     variance2 = np.square(variance)
     mu2S = mu2+S # NxQ
     common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
+    
+    dL_dpsi2_2 = dL_dpsi2+dL_dpsi2.T
 
     dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\
-        np.einsum('mo,nq,mq,nq,no->q',dL_dpsi2,gamma,Z,mu,common_sum)+\
-        np.einsum('mo,nq,oq,nq,nm->q',dL_dpsi2,gamma,Z,mu,common_sum)
+        np.einsum('mo,nq,mq,nq,no->q',dL_dpsi2_2,gamma,Z,mu,common_sum)
         
     dL_dgamma = np.einsum('mo,q,mq,oq,nq->nq',dL_dpsi2,variance2,Z,Z,(mu2S-2.*gamma*mu2))+\
-        np.einsum('mo,q,mq,nq,no->nq',dL_dpsi2,variance,Z,mu,common_sum)+\
-        np.einsum('mo,q,oq,nq,nm->nq',dL_dpsi2,variance,Z,mu,common_sum)
+        np.einsum('mo,q,mq,nq,no->nq',dL_dpsi2_2,variance,Z,mu,common_sum)
     
     dL_dmu = np.einsum('mo,q,mq,oq,nq,nq->nq',dL_dpsi2,variance2,Z,Z,mu,2.*(gamma-gamma2))+\
-        np.einsum('mo,nq,q,mq,no->nq',dL_dpsi2,gamma,variance,Z,common_sum)+\
-        np.einsum('mo,nq,q,oq,nm->nq',dL_dpsi2,gamma,variance,Z,common_sum)
+        np.einsum('mo,nq,q,mq,no->nq',dL_dpsi2_2,gamma,variance,Z,common_sum)
                     
     dL_dS = np.einsum('mo,nq,q,mq,oq->nq',dL_dpsi2,gamma,variance2,Z,Z)
     

From abcea702911bcc7b8cf7e9875e6dfe861546efb5 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 18 Aug 2014 17:30:59 +0100
Subject: [PATCH 06/14] some further performance improvement for linear kernel
 psi statistics

---
 GPy/kern/_src/linear.py                     |  1 -
 GPy/kern/_src/psi_comp/linear_psi_comp.py   | 10 +++++-----
 GPy/kern/_src/psi_comp/sslinear_psi_comp.py | 21 ++++++++++-----------
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index 41ec3cae..6874c0d6 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -113,7 +113,6 @@ class Linear(Kern):
     def psi1(self, Z, variational_posterior):
         return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1]
 
-    @Cache_this(limit=1)
     def psi2(self, Z, variational_posterior):
         return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2]
 
diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py
index 94cba0f5..93297e7e 100644
--- a/GPy/kern/_src/psi_comp/linear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py
@@ -56,15 +56,15 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
     
     variance2 = np.square(variance)
     common_sum = np.einsum('q,mq,nq->nm',variance,Z,mu) # NxM
-    dL_dpsi2_2 = dL_dpsi2+dL_dpsi2.T
+    Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
+    common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
 
-    dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*S,variance,Z,Z)+\
-        np.einsum('mo,mq,nq,no->q',dL_dpsi2_2,Z,mu,common_sum)
+    dL_dvar = np.einsum('q,nq,q->q',Z_expect,2.*S,variance)+ np.einsum('nq,nq->q',common_expect,mu)
             
-    dL_dmu = np.einsum('mo,q,mq,no->nq',dL_dpsi2_2,variance,Z,common_sum)
+    dL_dmu = np.einsum('nq,q->nq',common_expect,variance)
     
     dL_dS = np.empty(S.shape)
-    dL_dS[:] = np.einsum('mo,q,mq,oq->q',dL_dpsi2,variance2,Z,Z)
+    dL_dS[:] = np.einsum('q,q->q',Z_expect,variance2)
     
     dL_dZ = 2.*(np.einsum('om,q,mq,nq->oq',dL_dpsi2,variance2,Z,S)+np.einsum('om,q,nq,nm->oq',dL_dpsi2,variance,mu,common_sum))
 
diff --git a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
index dc864b3c..b505f26f 100644
--- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
@@ -64,21 +64,20 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
     variance2 = np.square(variance)
     mu2S = mu2+S # NxQ
     common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
+    Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
+    common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
     
-    dL_dpsi2_2 = dL_dpsi2+dL_dpsi2.T
-
-    dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\
-        np.einsum('mo,nq,mq,nq,no->q',dL_dpsi2_2,gamma,Z,mu,common_sum)
+    dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
+        np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
         
-    dL_dgamma = np.einsum('mo,q,mq,oq,nq->nq',dL_dpsi2,variance2,Z,Z,(mu2S-2.*gamma*mu2))+\
-        np.einsum('mo,q,mq,nq,no->nq',dL_dpsi2_2,variance,Z,mu,common_sum)
+    dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
+        np.einsum('nq,q,nq->nq',common_expect,variance,mu)
     
-    dL_dmu = np.einsum('mo,q,mq,oq,nq,nq->nq',dL_dpsi2,variance2,Z,Z,mu,2.*(gamma-gamma2))+\
-        np.einsum('mo,nq,q,mq,no->nq',dL_dpsi2_2,gamma,variance,Z,common_sum)
+    dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
+            np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
                     
-    dL_dS = np.einsum('mo,nq,q,mq,oq->nq',dL_dpsi2,gamma,variance2,Z,Z)
+    dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
     
-    dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,mu2S)+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum)
-                -np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma2,variance2,Z,mu2))
+    dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
 
     return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ

From f5f90b3a2c9b3043a5765c5c8f22e552ab89bf8c Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 18 Aug 2014 18:03:08 +0100
Subject: [PATCH 07/14] fix the pickle problem for models with psi statistics

---
 GPy/kern/_src/psi_comp/__init__.py |  4 ++--
 GPy/models/ss_gplvm.py             | 13 ++++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py
index 5aabd3b1..48abc68c 100644
--- a/GPy/kern/_src/psi_comp/__init__.py
+++ b/GPy/kern/_src/psi_comp/__init__.py
@@ -9,7 +9,7 @@ import ssrbf_psi_comp
 import sslinear_psi_comp
 import linear_psi_comp
 
-class PSICOMP_RBF(object):
+class PSICOMP_RBF(Pickleable):
 
     @Cache_this(limit=2, ignore_args=(0,))
     def psicomputations(self, variance, lengthscale, Z, variational_posterior):
@@ -29,7 +29,7 @@ class PSICOMP_RBF(object):
         else:
             raise ValueError, "unknown distriubtion received for psi-statistics"
         
-class PSICOMP_Linear(object):
+class PSICOMP_Linear(Pickleable):
 
     @Cache_this(limit=2, ignore_args=(0,))
     def psicomputations(self, variance, Z, variational_posterior):
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index 027f9783..ba793fc2 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -23,7 +23,7 @@ class SSGPLVM(SparseGP):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
+    def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
                  Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, pi=None, learnPi=True, **kwargs):
 
         self.mpi_comm = mpi_comm
@@ -41,10 +41,13 @@ class SSGPLVM(SparseGP):
         if X_variance is None: # The variance of the variational approximation (S)
             X_variance = np.random.uniform(0,.1,X.shape)
             
-        gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
-        gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-        gamma[gamma>1.-1e-9] = 1.-1e-9
-        gamma[gamma<1e-9] = 1e-9
+        if Gamma is None:
+            gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+            gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+            gamma[gamma>1.-1e-9] = 1.-1e-9
+            gamma[gamma<1e-9] = 1e-9
+        else:
+            gamma = Gamma.copy()
                 
         if Z is None:
             Z = np.random.permutation(X.copy())[:num_inducing]

From b9fdbedf20d4292edbcaaeaeacc9b45238207364 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 18 Aug 2014 18:14:16 +0100
Subject: [PATCH 08/14] update additive kernel for SSGPLVM

---
 GPy/kern/_src/add.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 2b0e531f..259c2018 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -118,9 +118,9 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
                 else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
             p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
 
     def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
@@ -135,16 +135,15 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
                 else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
             target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         return target
 
     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         from static import White, Bias
-        target_mu = np.zeros(variational_posterior.shape)
-        target_S = np.zeros(variational_posterior.shape)
+        target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
         for p1 in self.parameters:
             #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
             eff_dL_dpsi1 = dL_dpsi1.copy()
@@ -154,13 +153,12 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
                 else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
-            a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
-            target_mu += a
-            target_S += b
-        return target_mu, target_S
+                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
+            grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            [np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))]
+        return target_grads
 
     def add(self, other, name='sum'):
         if isinstance(other, Add):

From 80adaed6168b21405a59ae0cd6e134a262a6ea66 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 19 Aug 2014 12:12:07 +0100
Subject: [PATCH 09/14] a bug fix for psi statistics related model pickle

---
 GPy/kern/_src/psi_comp/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py
index 48abc68c..7a5851fb 100644
--- a/GPy/kern/_src/psi_comp/__init__.py
+++ b/GPy/kern/_src/psi_comp/__init__.py
@@ -47,4 +47,7 @@ class PSICOMP_Linear(Pickleable):
         elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
             return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
         else:
-            raise ValueError, "unknown distriubtion received for psi-statistics"
\ No newline at end of file
+            raise ValueError, "unknown distriubtion received for psi-statistics"
+        
+    def _setup_observers(self):
+        pass
\ No newline at end of file

From 22de3ab676589afea0bc19575d554e6b3d1ae383 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 19 Aug 2014 10:37:48 -0700
Subject: [PATCH 10/14] [pickling] wb as write parameter

---
 GPy/core/parameterization/parameter_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 4edb0520..16c382b8 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -191,7 +191,7 @@ class Pickleable(object):
         """
         import cPickle as pickle
         if isinstance(f, str):
-            with open(f, 'w') as f:
+            with open(f, 'wb') as f:
                 pickle.dump(self, f, protocol)
         else:
             pickle.dump(self, f, protocol)

From 3972b4bd9a11a86d7d5fcb2dbb1b81a81eba5e37 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 19 Aug 2014 10:41:53 -0700
Subject: [PATCH 11/14] [linear] einsums

---
 GPy/kern/_src/linear.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index 6874c0d6..e56e793e 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -51,7 +51,7 @@ class Linear(Kern):
         self.variances = Param('variances', variances, Logexp())
         self.add_parameter(self.variances)
         self.psicomp = PSICOMP_Linear()
-    
+
     @Cache_this(limit=2)
     def K(self, X, X2=None):
         if self.ARD:
@@ -76,10 +76,12 @@ class Linear(Kern):
     def update_gradients_full(self, dL_dK, X, X2=None):
         if self.ARD:
             if X2 is None:
-                self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)])
+                #self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)])
+                self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X)
             else:
-                product = X[:, None, :] * X2[None, :, :]
-                self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0)
+                #product = X[:, None, :] * X2[None, :, :]
+                #self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0)
+                self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X2)
         else:
             self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK)
 
@@ -93,9 +95,10 @@ class Linear(Kern):
 
     def gradients_X(self, dL_dK, X, X2=None):
         if X2 is None:
-            return np.einsum('mq,nm->nq',X*self.variances,dL_dK)+np.einsum('nq,nm->mq',X*self.variances,dL_dK)
+            return np.einsum('jq,q,ij->iq', X, 2*self.variances, dL_dK)
         else:
-            return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
+            #return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
+            return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
 
     def gradients_X_diag(self, dL_dKdiag, X):
         return 2.*self.variances*dL_dKdiag[:,None]*X

From d00089387892ea2268b8673b8db7b73dec9e73d9 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Mon, 25 Aug 2014 09:46:20 -0700
Subject: [PATCH 12/14] [ard] enhanced ard handling and plotting

Conflicts:
	GPy/kern/_src/linear.py
	GPy/models/ss_gplvm.py
---
 GPy/core/gp.py                           |   4 +-
 GPy/kern/_src/add.py                     |  19 +-
 GPy/kern/_src/kern.py                    |  16 +-
 GPy/kern/_src/linear.py                  |   1 -
 GPy/kern/_src/static.py                  |   6 +-
 GPy/kern/_src/stationary.py              |   4 +-
 GPy/models/ss_gplvm.py                   | 355 +++++++++++++++++------
 GPy/plotting/matplot_dep/kernel_plots.py |  36 ++-
 8 files changed, 323 insertions(+), 118 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index ba7c38f2..f3111e60 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -285,11 +285,11 @@ class GP(Model):
                                      plot_raw=plot_raw, Y_metadata=Y_metadata,
                                      data_symbol=data_symbol, **kw)
 
-    def input_sensitivity(self):
+    def input_sensitivity(self, summarize=True):
         """
         Returns the sensitivity for each dimension of this model
         """
-        return self.kern.input_sensitivity()
+        return self.kern.input_sensitivity(summarize=summarize)
 
     def optimize(self, optimizer=None, start=None, **kwargs):
         """
diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 259c2018..27f8ebd1 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -14,6 +14,13 @@ class Add(CombinationKernel):
     This kernel will take over the active dims of it's subkernels passed in.
     """
     def __init__(self, subkerns, name='add'):
+        for i, kern in enumerate(subkerns[:]):
+            if isinstance(kern, Add):
+                del subkerns[i]
+                for part in kern.parts[::-1]:
+                    kern.remove_parameter(part)
+                    subkerns.insert(i, part)
+
         super(Add, self).__init__(subkerns, name)
 
     @Cache_this(limit=2, force_kwargs=['which_parts'])
@@ -160,7 +167,7 @@ class Add(CombinationKernel):
             [np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))]
         return target_grads
 
-    def add(self, other, name='sum'):
+    def add(self, other):
         if isinstance(other, Add):
             other_params = other.parameters[:]
             for p in other_params:
@@ -171,5 +178,11 @@ class Add(CombinationKernel):
         self.input_dim, self.active_dims = self.get_input_dim_active_dims(self.parts)
         return self
 
-    def input_sensitivity(self):
-        return reduce(np.add, [k.input_sensitivity() for k in self.parts])
+    def input_sensitivity(self, summarize=True):
+        if summarize:
+            return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts])
+        else:
+            i_s = np.zeros((len(self.parts), self.input_dim))
+            from operator import setitem
+            [setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
+            return i_s
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index d6e761c3..d8377ffc 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -23,9 +23,9 @@ class Kern(Parameterized):
 
         input_dim:
 
-            is the number of dimensions to work on. Make sure to give the 
+            is the number of dimensions to work on. Make sure to give the
             tight dimensionality of inputs.
-            You most likely want this to be the integer telling the number of 
+            You most likely want this to be the integer telling the number of
             input dimensions of the kernel.
             If this is not an integer (!) we will work on the whole input matrix X,
             and not check whether dimensions match or not (!).
@@ -134,7 +134,7 @@ class Kern(Parameterized):
         from ...plotting.matplot_dep import kernel_plots
         return kernel_plots.plot_ARD(self,*args,**kw)
 
-    def input_sensitivity(self):
+    def input_sensitivity(self, summarize=True):
         """
         Returns the sensitivity for each dimension of this kernel.
         """
@@ -144,6 +144,9 @@ class Kern(Parameterized):
         """ Overloading of the '+' operator. for more control, see self.add """
         return self.add(other)
 
+    def __iadd__(self, other):
+        return self.add(other)
+
     def add(self, other, name='add'):
         """
         Add another kernel to this one.
@@ -235,7 +238,12 @@ class CombinationKernel(Kern):
         active_dims = np.arange(input_dim)
         return input_dim, active_dims
 
-    def input_sensitivity(self):
+    def input_sensitivity(self, summarize=True):
+        """
+        If summize is true, we want to get the summerized view of the sensitivities,
+        otherwise put everything into an array with shape (#kernels, input_dim)
+        in the order of appearance of the kernels in the parameterized object.
+        """
         raise NotImplementedError("Choose the kernel you want to get the sensitivity for. You need to override the default behaviour for getting the input sensitivity to be able to get the input sensitivity. For sum kernel it is the sum of all sensitivities, TODO: product kernel? Other kernels?, also TODO: shall we return all the sensitivities here in the combination kernel? So we can combine them however we want? This could lead to just plot all the sensitivities here...")
 
     def _check_active_dims(self, X):
diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index e56e793e..9fdacdbb 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -132,7 +132,6 @@ class Linear(Kern):
     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:]
 
-
 class LinearFull(Kern):
     def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):
         super(LinearFull, self).__init__(input_dim, active_dims, name)
diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py
index e191b569..7820c634 100644
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@@ -40,6 +40,11 @@ class Static(Kern):
         K = self.K(variational_posterior.mean, Z)
         return np.einsum('ij,ik->jk',K,K) #K[:,:,None]*K[:,None,:] # NB. more efficient implementations on inherriting classes
 
+    def input_sensitivity(self, summarize=True):
+        if summarize:
+            return super(Static, self).input_sensitivity(summarize=summarize)
+        else:
+            return np.ones(self.input_dim) * self.variance
 
 class White(Static):
     def __init__(self, input_dim, variance=1., active_dims=None, name='white'):
@@ -63,7 +68,6 @@ class White(Static):
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         self.variance.gradient = dL_dpsi0.sum()
 
-
 class Bias(Static):
     def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
         super(Bias, self).__init__(input_dim, variance, active_dims, name)
diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py
index c0553238..f7993e82 100644
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@@ -179,7 +179,7 @@ class Stationary(Kern):
     def gradients_X_diag(self, dL_dKdiag, X):
         return np.zeros(X.shape)
 
-    def input_sensitivity(self):
+    def input_sensitivity(self, summarize=True):
         return np.ones(self.input_dim)/self.lengthscale**2
 
 class Exponential(Stationary):
@@ -340,7 +340,7 @@ class RatQuad(Stationary):
     """
 
 
-    def __init__(self, input_dim, variance=1., lengthscale=None, power=2., ARD=False, active_dims=None, name='ExpQuad'):
+    def __init__(self, input_dim, variance=1., lengthscale=None, power=2., ARD=False, active_dims=None, name='RatQuad'):
         super(RatQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
         self.power = Param('power', power, Logexp())
         self.add_parameters(self.power)
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index ba793fc2..ba66eec3 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -2,14 +2,18 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
+import itertools
+from matplotlib import pyplot
 
 from ..core.sparse_gp import SparseGP
 from .. import kern
 from ..likelihoods import Gaussian
+from ..inference.optimization import SCG
+from ..util import linalg
 from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
-from ..kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU
+
 
 class SSGPLVM(SparseGP):
     """
@@ -23,13 +27,9 @@ class SSGPLVM(SparseGP):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, pi=None, learnPi=True, **kwargs):
+    def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', group_spike=False, **kwargs):
 
-        self.mpi_comm = mpi_comm
-        self.__IN_OPTIMIZATION__ = False
-        self.group_spike = group_spike
-        
         if X == None:
             from ..util.initialization import initialize_latent
             X, fracs = initialize_latent(init, input_dim, Y)
@@ -40,66 +40,49 @@ class SSGPLVM(SparseGP):
 
         if X_variance is None: # The variance of the variational approximation (S)
             X_variance = np.random.uniform(0,.1,X.shape)
-            
-        if Gamma is None:
-            gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
-            gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-            gamma[gamma>1.-1e-9] = 1.-1e-9
-            gamma[gamma<1e-9] = 1e-9
-        else:
-            gamma = Gamma.copy()
-                
+
+        gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
+        gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
+
+        if group_spike:
+            gamma[:] = gamma.mean(axis=0)
+
         if Z is None:
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]
-        
+
         if likelihood is None:
             likelihood = Gaussian()
 
         if kernel is None:
             kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim)
-        if kernel.useGPU:
-            kernel.psicomp = PSICOMP_SSRBF_GPU()
-        
-        if inference_method is None:
-            inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
 
-        if pi is None:
-            pi = np.empty((input_dim))
-            pi[:] = 0.5
-        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi) # the prior probability of the latent binary variable b
-        
+        pi = np.empty((input_dim))
+        pi[:] = 0.5
+        self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b
+
+        X = np.asfortranarray(X)
+        X_variance = np.asfortranarray(X_variance)
+        gamma = np.asfortranarray(gamma)
         X = SpikeAndSlabPosterior(X, X_variance, gamma)
-        
+
+        if group_spike:
+            kernel.group_spike_prob = True
+            self.variational_prior.group_spike_prob = True
+
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.add_parameter(self.X, index=0)
         self.add_parameter(self.variational_prior)
-                
-        if mpi_comm != None:
-            from ..util.mpi import divide_data
-            N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm)
-            self.N_range = (N_start, N_end)
-            self.N_list = np.array(N_list)
-            self.Y_local = self.Y[N_start:N_end]
-            print 'MPI RANK: '+str(self.mpi_comm.rank)+' with datasize: '+str(self.N_range)
-            mpi_comm.Bcast(self.param_array, root=0)
-        
-        if self.group_spike:
-            [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together
-        
+
     def set_X_gradients(self, X, X_grad):
         """Set the gradients of the posterior distribution of X in its specific form."""
         X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad
-    
-    def get_X_gradients(self, X):
-        """Get the gradients of the posterior distribution of X in its specific form."""
-        return X.mean.gradient, X.variance.gradient, X.binary_prob.gradient
 
     def parameters_changed(self):
         if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
-            update_gradients(self, mpi_comm=self.mpi_comm)
+            update_gradients(self)
             return
-        
+
         super(SSGPLVM, self).parameters_changed()
         self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
 
@@ -108,7 +91,7 @@ class SSGPLVM(SparseGP):
         # update for the KL divergence
         self.variational_prior.update_gradients_KL(self.X)
 
-    def input_sensitivity(self):
+    def input_sensitivity(self, summarize=True):
         if self.kern.ARD:
             return self.kern.input_sensitivity()
         else:
@@ -121,47 +104,235 @@ class SSGPLVM(SparseGP):
 
         return dim_reduction_plots.plot_latent(self, plot_inducing=plot_inducing, *args, **kwargs)
 
-    def __getstate__(self):
-        dc = super(SSGPLVM, self).__getstate__()
-        dc['mpi_comm'] = None
-        if self.mpi_comm != None:
-            del dc['N_range']
-            del dc['N_list']
-            del dc['Y_local']
-        return dc
- 
-    def __setstate__(self, state):
-        return super(SSGPLVM, self).__setstate__(state)
-    
-    #=====================================================
-    # The MPI parallelization 
-    #     - can move to model at some point
-    #=====================================================
-    
-    def _set_params_transformed(self, p):
-        if self.mpi_comm != None:
-            if self.__IN_OPTIMIZATION__ and self.mpi_comm.rank==0:
-                self.mpi_comm.Bcast(np.int32(1),root=0)
-            self.mpi_comm.Bcast(p, root=0)
-        super(SSGPLVM, self)._set_params_transformed(p)
-        
-    def optimize(self, optimizer=None, start=None, **kwargs):
-        self.__IN_OPTIMIZATION__ = True
-        if self.mpi_comm==None:
-            super(SSGPLVM, self).optimize(optimizer,start,**kwargs)
-        elif self.mpi_comm.rank==0:
-            super(SSGPLVM, self).optimize(optimizer,start,**kwargs)
-            self.mpi_comm.Bcast(np.int32(-1),root=0)
-        elif self.mpi_comm.rank>0:
-            x = self._get_params_transformed().copy()
-            flag = np.empty(1,dtype=np.int32)
-            while True:
-                self.mpi_comm.Bcast(flag,root=0)
-                if flag==1:
-                    self._set_params_transformed(x)
-                elif flag==-1:
-                    break
-                else:
-                    self.__IN_OPTIMIZATION__ = False
-                    raise Exception("Unrecognizable flag for synchronization!")
-        self.__IN_OPTIMIZATION__ = False
+    def do_test_latents(self, Y):
+        """
+        Compute the latent representation for a set of new points Y
+
+        Notes:
+        This will only work with a univariate Gaussian likelihood (for now)
+        """
+        assert not self.likelihood.is_heteroscedastic
+        N_test = Y.shape[0]
+        input_dim = self.Z.shape[1]
+        means = np.zeros((N_test, input_dim))
+        covars = np.zeros((N_test, input_dim))
+
+        dpsi0 = -0.5 * self.output_dim * self.likelihood.precision
+        dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
+        V = self.likelihood.precision * Y
+
+        #compute CPsi1V
+        if self.Cpsi1V is None:
+            psi1V = np.dot(self.psi1.T, self.likelihood.V)
+            tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
+            tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
+            self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
+
+        dpsi1 = np.dot(self.Cpsi1V, V.T)
+
+        start = np.zeros(self.input_dim * 2)
+
+        for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
+            args = (self.kern, self.Z, dpsi0, dpsi1_n.T, dpsi2)
+            xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
+
+            mu, log_S = xopt.reshape(2, 1, -1)
+            means[n] = mu[0].copy()
+            covars[n] = np.exp(log_S[0]).copy()
+
+        return means, covars
+
+    def dmu_dX(self, Xnew):
+        """
+        Calculate the gradient of the prediction at Xnew w.r.t Xnew.
+        """
+        dmu_dX = np.zeros_like(Xnew)
+        for i in range(self.Z.shape[0]):
+            dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
+        return dmu_dX
+
+    def dmu_dXnew(self, Xnew):
+        """
+        Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
+        """
+        dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
+        ones = np.ones((1, 1))
+        for i in range(self.Z.shape[0]):
+            dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
+        return np.dot(dK_dX, self.Cpsi1Vf)
+
+    def plot_steepest_gradient_map(self, fignum=None, ax=None, which_indices=None, labels=None, data_labels=None, data_marker='o', data_s=40, resolution=20, aspect='auto', updates=False, ** kwargs):
+        input_1, input_2 = significant_dims = most_significant_input_dimensions(self, which_indices)
+
+        X = np.zeros((resolution ** 2, self.input_dim))
+        indices = np.r_[:X.shape[0]]
+        if labels is None:
+            labels = range(self.output_dim)
+
+        def plot_function(x):
+            X[:, significant_dims] = x
+            dmu_dX = self.dmu_dXnew(X)
+            argmax = np.argmax(dmu_dX, 1)
+            return dmu_dX[indices, argmax], np.array(labels)[argmax]
+
+        if ax is None:
+            fig = pyplot.figure(num=fignum)
+            ax = fig.add_subplot(111)
+
+        if data_labels is None:
+            data_labels = np.ones(self.num_data)
+        ulabels = []
+        for lab in data_labels:
+            if not lab in ulabels:
+                ulabels.append(lab)
+        marker = itertools.cycle(list(data_marker))
+        from GPy.util import Tango
+        for i, ul in enumerate(ulabels):
+            if type(ul) is np.string_:
+                this_label = ul
+            elif type(ul) is np.int64:
+                this_label = 'class %i' % ul
+            else:
+                this_label = 'class %i' % i
+            m = marker.next()
+            index = np.nonzero(data_labels == ul)[0]
+            x = self.X[index, input_1]
+            y = self.X[index, input_2]
+            ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
+
+        ax.set_xlabel('latent dimension %i' % input_1)
+        ax.set_ylabel('latent dimension %i' % input_2)
+
+        from matplotlib.cm import get_cmap
+        from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImAnnotateController
+        if not 'cmap' in kwargs.keys():
+            kwargs.update(cmap=get_cmap('jet'))
+        controller = ImAnnotateController(ax,
+                                      plot_function,
+                                      tuple(self.X.min(0)[:, significant_dims]) + tuple(self.X.max(0)[:, significant_dims]),
+                                      resolution=resolution,
+                                      aspect=aspect,
+                                      **kwargs)
+        ax.legend()
+        ax.figure.tight_layout()
+        if updates:
+            pyplot.show()
+            clear = raw_input('Enter to continue')
+            if clear.lower() in 'yes' or clear == '':
+                controller.deactivate()
+        return controller.view
+
+    def plot_X_1d(self, fignum=None, ax=None, colors=None):
+        """
+        Plot latent space X in 1D:
+
+            - if fig is given, create input_dim subplots in fig and plot in these
+            - if ax is given plot input_dim 1D latent space plots of X into each `axis`
+            - if neither fig nor ax is given create a figure with fignum and plot in there
+
+        colors:
+            colors of different latent space dimensions input_dim
+
+        """
+        import pylab
+        if ax is None:
+            fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
+        if colors is None:
+            colors = pylab.gca()._get_lines.color_cycle
+            pylab.clf()
+        else:
+            colors = iter(colors)
+        plots = []
+        x = np.arange(self.X.shape[0])
+        for i in range(self.X.shape[1]):
+            if ax is None:
+                a = fig.add_subplot(self.X.shape[1], 1, i + 1)
+            elif isinstance(ax, (tuple, list)):
+                a = ax[i]
+            else:
+                raise ValueError("Need one ax per latent dimnesion input_dim")
+            a.plot(self.X, c='k', alpha=.3)
+            plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
+            a.fill_between(x,
+                            self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
+                            self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
+                            facecolor=plots[-1].get_color(),
+                            alpha=.3)
+            a.legend(borderaxespad=0.)
+            a.set_xlim(x.min(), x.max())
+            if i < self.X.shape[1] - 1:
+                a.set_xticklabels('')
+        pylab.draw()
+        if ax is None:
+            fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+        return fig
+
+    def getstate(self):
+        """
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
+        """
+        return SparseGP._getstate(self) + [self.init]
+
+    def setstate(self, state):
+        self._const_jitter = None
+        self.init = state.pop()
+        SparseGP._setstate(self, state)
+
+
+def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
+    """
+    objective function for fitting the latent variables for test points
+    (negative log-likelihood: should be minimised!)
+    """
+    mu, log_S = mu_S.reshape(2, 1, -1)
+    S = np.exp(log_S)
+
+    psi0 = kern.psi0(Z, mu, S)
+    psi1 = kern.psi1(Z, mu, S)
+    psi2 = kern.psi2(Z, mu, S)
+
+    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
+
+    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
+    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
+    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
+
+    dmu = mu0 + mu1 + mu2 - mu
+    # dS = S0 + S1 + S2 -0.5 + .5/S
+    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
+    return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
+
+def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
+    """
+    objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
+    This is the same as latent_cost_and_grad but only for the objective
+    """
+    mu, log_S = mu_S.reshape(2, 1, -1)
+    S = np.exp(log_S)
+
+    psi0 = kern.psi0(Z, mu, S)
+    psi1 = kern.psi1(Z, mu, S)
+    psi2 = kern.psi2(Z, mu, S)
+
+    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
+    return -float(lik)
+
+def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
+    """
+    This is the same as latent_cost_and_grad but only for the grad
+    """
+    mu, log_S = mu_S.reshape(2, 1, -1)
+    S = np.exp(log_S)
+
+    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
+    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
+    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
+
+    dmu = mu0 + mu1 + mu2 - mu
+    # dS = S0 + S1 + S2 -0.5 + .5/S
+    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
+
+    return -np.hstack((dmu.flatten(), dlnS.flatten()))
+
+
diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py
index 3c6b911f..60b5fa7c 100644
--- a/GPy/plotting/matplot_dep/kernel_plots.py
+++ b/GPy/plotting/matplot_dep/kernel_plots.py
@@ -30,18 +30,18 @@ def add_bar_labels(fig, ax, bars, bottom=0):
                 c = 'k'
                 transform = transOffsetUp
             ax.text(xi, height, "${xi}$".format(xi=int(num)), color=c, rotation=0, ha='center', va=va, transform=transform)
-    
+
     ax.set_xticks([])
 
 
 def plot_bars(fig, ax, x, ard_params, color, name, bottom=0):
     from ...util.misc import param_to_array
-    return ax.bar(left=x, height=param_to_array(ard_params), width=.8, 
-                  bottom=bottom, align='center', 
-                  color=color, edgecolor='k', linewidth=1.2, 
+    return ax.bar(left=x, height=param_to_array(ard_params), width=.8,
+                  bottom=bottom, align='center',
+                  color=color, edgecolor='k', linewidth=1.2,
                   label=name.replace("_"," "))
 
-def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
+def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False, filtering=None):
     """
     If an ARD kernel is present, plot a bar representation using matplotlib
 
@@ -51,6 +51,10 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
         title of the plot,
         pass '' to not print a title
         pass None for a generic title
+    :param filtering: list of names, which to use for plotting ARD parameters.
+                      Only kernels which match names in the list of names in filtering
+                      will be used for plotting.
+    :type filtering: list of names to use for ARD plot
     """
     fig, ax = ax_default(fignum,ax)
 
@@ -58,19 +62,25 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
         ax.set_title('ARD parameters, %s kernel' % kernel.name)
     else:
         ax.set_title(title)
-    
+
     Tango.reset()
     bars = []
-    
-    ard_params = np.atleast_2d(kernel.input_sensitivity())
+
+    ard_params = np.atleast_2d(kernel.input_sensitivity(summarize=False))
     bottom = 0
     x = np.arange(kernel.input_dim)
-    
+
+    if order is None:
+        order = kernel.parameter_names(recursive=False)
+
     for i in range(ard_params.shape[0]):
-        c = Tango.nextMedium()
-        bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel.parameters[i].name, bottom=bottom))
-        bottom += ard_params[i,:]
-    
+        if kernel.parameters[i].name in order:
+            c = Tango.nextMedium()
+            bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel.parameters[i].name, bottom=bottom))
+            bottom += ard_params[i,:]
+        else:
+            print "filtering out {}".format(kernel.parameters[i].name)
+
     ax.set_xlim(-.5, kernel.input_dim - .5)
     add_bar_labels(fig, ax, [bars[-1]], bottom=bottom-ard_params[i,:])
 

From 77a96efeba82d63d7b74d9e865927160223f6906 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 26 Aug 2014 16:38:00 +0100
Subject: [PATCH 13/14] improve numerical stability of vardtc_parallel

---
 GPy/core/parameterization/transformations.py               | 2 +-
 .../latent_function_inference/var_dtc_parallel.py          | 7 ++++---
 GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py                  | 6 +++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index df31f09f..53d4301d 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -54,7 +54,7 @@ class Transformation(object):
 class Logexp(Transformation):
     domain = _POSITIVE
     def f(self, x):
-        return np.where(x>_lim_val, x, np.log(1. + np.exp(np.clip(x, -_lim_val, _lim_val)))) + epsilon
+        return np.where(x>_lim_val, x, np.log1p(np.exp(np.clip(x, -_lim_val, _lim_val)))) + epsilon
         #raises overflow warning: return np.where(x>_lim_val, x, np.log(1. + np.exp(x)))
     def finv(self, f):
         return np.where(f>_lim_val, f, np.log(np.exp(f+1e-20) - 1.))
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index 03f5d478..19fc0fa8 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -169,14 +169,15 @@ class VarDTC_minibatch(LatentFunctionInference):
         Kmm = kern.K(Z).copy()
         diag.add(Kmm, self.const_jitter)
         Lm = jitchol(Kmm)
-                
-        Lambda = Kmm+psi2_full
+        
+        LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
+        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
         LL = jitchol(Lambda)
+        LL = np.dot(Lm,LL)
         b,_ = dtrtrs(LL, psi1Y_full.T)
         bbt = np.square(b).sum()
         v,_ = dtrtrs(LL.T,b,lower=False)
         vvt = np.einsum('md,od->mo',v,v)
-        LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
         
         Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T
         LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0]
diff --git a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
index 73c2c33b..dda68bdf 100644
--- a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
@@ -378,7 +378,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         if self.GPU_direct:
             dL_dpsi1_gpu = dL_dpsi1
             dL_dpsi2_gpu = dL_dpsi2
-            dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
+            dL_dpsi0_sum = dL_dpsi0.get().sum() #gpuarray.sum(dL_dpsi0).get()
         else:
             dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
             dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
@@ -394,7 +394,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
         self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
 
-        dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
+        dL_dvar = dL_dpsi0_sum + dvar_gpu.get().sum()#gpuarray.sum(dvar_gpu).get()
         sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
         dL_dmu = grad_mu_gpu.get()
         sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
@@ -404,7 +404,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
             sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
             dL_dlengscale = grad_l_gpu.get()
         else:
-            dL_dlengscale = gpuarray.sum(dl_gpu).get()
+            dL_dlengscale = dl_gpu.get().sum() #gpuarray.sum(dl_gpu).get()
             
         return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
     

From a853d060fba8d00b9a1bc27c71709a3e3642cca7 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 26 Aug 2014 16:46:28 +0100
Subject: [PATCH 14/14] recover the ss_gplvm.py

---
 GPy/models/ss_gplvm.py | 355 +++++++++++------------------------------
 1 file changed, 92 insertions(+), 263 deletions(-)

diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index ba66eec3..ba793fc2 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -2,18 +2,14 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-import itertools
-from matplotlib import pyplot
 
 from ..core.sparse_gp import SparseGP
 from .. import kern
 from ..likelihoods import Gaussian
-from ..inference.optimization import SCG
-from ..util import linalg
 from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
-
+from ..kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU
 
 class SSGPLVM(SparseGP):
     """
@@ -27,9 +23,13 @@ class SSGPLVM(SparseGP):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', group_spike=False, **kwargs):
+    def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, pi=None, learnPi=True, **kwargs):
 
+        self.mpi_comm = mpi_comm
+        self.__IN_OPTIMIZATION__ = False
+        self.group_spike = group_spike
+        
         if X == None:
             from ..util.initialization import initialize_latent
             X, fracs = initialize_latent(init, input_dim, Y)
@@ -40,49 +40,66 @@ class SSGPLVM(SparseGP):
 
         if X_variance is None: # The variance of the variational approximation (S)
             X_variance = np.random.uniform(0,.1,X.shape)
-
-        gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
-        gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
-
-        if group_spike:
-            gamma[:] = gamma.mean(axis=0)
-
+            
+        if Gamma is None:
+            gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+            gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+            gamma[gamma>1.-1e-9] = 1.-1e-9
+            gamma[gamma<1e-9] = 1e-9
+        else:
+            gamma = Gamma.copy()
+                
         if Z is None:
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]
-
+        
         if likelihood is None:
             likelihood = Gaussian()
 
         if kernel is None:
             kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim)
+        if kernel.useGPU:
+            kernel.psicomp = PSICOMP_SSRBF_GPU()
+        
+        if inference_method is None:
+            inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
 
-        pi = np.empty((input_dim))
-        pi[:] = 0.5
-        self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b
-
-        X = np.asfortranarray(X)
-        X_variance = np.asfortranarray(X_variance)
-        gamma = np.asfortranarray(gamma)
+        if pi is None:
+            pi = np.empty((input_dim))
+            pi[:] = 0.5
+        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi) # the prior probability of the latent binary variable b
+        
         X = SpikeAndSlabPosterior(X, X_variance, gamma)
-
-        if group_spike:
-            kernel.group_spike_prob = True
-            self.variational_prior.group_spike_prob = True
-
+        
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.add_parameter(self.X, index=0)
         self.add_parameter(self.variational_prior)
-
+                
+        if mpi_comm != None:
+            from ..util.mpi import divide_data
+            N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm)
+            self.N_range = (N_start, N_end)
+            self.N_list = np.array(N_list)
+            self.Y_local = self.Y[N_start:N_end]
+            print 'MPI RANK: '+str(self.mpi_comm.rank)+' with datasize: '+str(self.N_range)
+            mpi_comm.Bcast(self.param_array, root=0)
+        
+        if self.group_spike:
+            [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together
+        
     def set_X_gradients(self, X, X_grad):
         """Set the gradients of the posterior distribution of X in its specific form."""
         X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad
+    
+    def get_X_gradients(self, X):
+        """Get the gradients of the posterior distribution of X in its specific form."""
+        return X.mean.gradient, X.variance.gradient, X.binary_prob.gradient
 
     def parameters_changed(self):
         if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
-            update_gradients(self)
+            update_gradients(self, mpi_comm=self.mpi_comm)
             return
-
+        
         super(SSGPLVM, self).parameters_changed()
         self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
 
@@ -91,7 +108,7 @@ class SSGPLVM(SparseGP):
         # update for the KL divergence
         self.variational_prior.update_gradients_KL(self.X)
 
-    def input_sensitivity(self, summarize=True):
+    def input_sensitivity(self):
         if self.kern.ARD:
             return self.kern.input_sensitivity()
         else:
@@ -104,235 +121,47 @@ class SSGPLVM(SparseGP):
 
         return dim_reduction_plots.plot_latent(self, plot_inducing=plot_inducing, *args, **kwargs)
 
-    def do_test_latents(self, Y):
-        """
-        Compute the latent representation for a set of new points Y
-
-        Notes:
-        This will only work with a univariate Gaussian likelihood (for now)
-        """
-        assert not self.likelihood.is_heteroscedastic
-        N_test = Y.shape[0]
-        input_dim = self.Z.shape[1]
-        means = np.zeros((N_test, input_dim))
-        covars = np.zeros((N_test, input_dim))
-
-        dpsi0 = -0.5 * self.output_dim * self.likelihood.precision
-        dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
-        V = self.likelihood.precision * Y
-
-        #compute CPsi1V
-        if self.Cpsi1V is None:
-            psi1V = np.dot(self.psi1.T, self.likelihood.V)
-            tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-            tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
-            self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        dpsi1 = np.dot(self.Cpsi1V, V.T)
-
-        start = np.zeros(self.input_dim * 2)
-
-        for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
-            args = (self.kern, self.Z, dpsi0, dpsi1_n.T, dpsi2)
-            xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
-
-            mu, log_S = xopt.reshape(2, 1, -1)
-            means[n] = mu[0].copy()
-            covars[n] = np.exp(log_S[0]).copy()
-
-        return means, covars
-
-    def dmu_dX(self, Xnew):
-        """
-        Calculate the gradient of the prediction at Xnew w.r.t Xnew.
-        """
-        dmu_dX = np.zeros_like(Xnew)
-        for i in range(self.Z.shape[0]):
-            dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
-        return dmu_dX
-
-    def dmu_dXnew(self, Xnew):
-        """
-        Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
-        """
-        dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
-        ones = np.ones((1, 1))
-        for i in range(self.Z.shape[0]):
-            dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
-        return np.dot(dK_dX, self.Cpsi1Vf)
-
-    def plot_steepest_gradient_map(self, fignum=None, ax=None, which_indices=None, labels=None, data_labels=None, data_marker='o', data_s=40, resolution=20, aspect='auto', updates=False, ** kwargs):
-        input_1, input_2 = significant_dims = most_significant_input_dimensions(self, which_indices)
-
-        X = np.zeros((resolution ** 2, self.input_dim))
-        indices = np.r_[:X.shape[0]]
-        if labels is None:
-            labels = range(self.output_dim)
-
-        def plot_function(x):
-            X[:, significant_dims] = x
-            dmu_dX = self.dmu_dXnew(X)
-            argmax = np.argmax(dmu_dX, 1)
-            return dmu_dX[indices, argmax], np.array(labels)[argmax]
-
-        if ax is None:
-            fig = pyplot.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if data_labels is None:
-            data_labels = np.ones(self.num_data)
-        ulabels = []
-        for lab in data_labels:
-            if not lab in ulabels:
-                ulabels.append(lab)
-        marker = itertools.cycle(list(data_marker))
-        from GPy.util import Tango
-        for i, ul in enumerate(ulabels):
-            if type(ul) is np.string_:
-                this_label = ul
-            elif type(ul) is np.int64:
-                this_label = 'class %i' % ul
-            else:
-                this_label = 'class %i' % i
-            m = marker.next()
-            index = np.nonzero(data_labels == ul)[0]
-            x = self.X[index, input_1]
-            y = self.X[index, input_2]
-            ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
-
-        ax.set_xlabel('latent dimension %i' % input_1)
-        ax.set_ylabel('latent dimension %i' % input_2)
-
-        from matplotlib.cm import get_cmap
-        from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImAnnotateController
-        if not 'cmap' in kwargs.keys():
-            kwargs.update(cmap=get_cmap('jet'))
-        controller = ImAnnotateController(ax,
-                                      plot_function,
-                                      tuple(self.X.min(0)[:, significant_dims]) + tuple(self.X.max(0)[:, significant_dims]),
-                                      resolution=resolution,
-                                      aspect=aspect,
-                                      **kwargs)
-        ax.legend()
-        ax.figure.tight_layout()
-        if updates:
-            pyplot.show()
-            clear = raw_input('Enter to continue')
-            if clear.lower() in 'yes' or clear == '':
-                controller.deactivate()
-        return controller.view
-
-    def plot_X_1d(self, fignum=None, ax=None, colors=None):
-        """
-        Plot latent space X in 1D:
-
-            - if fig is given, create input_dim subplots in fig and plot in these
-            - if ax is given plot input_dim 1D latent space plots of X into each `axis`
-            - if neither fig nor ax is given create a figure with fignum and plot in there
-
-        colors:
-            colors of different latent space dimensions input_dim
-
-        """
-        import pylab
-        if ax is None:
-            fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
-        if colors is None:
-            colors = pylab.gca()._get_lines.color_cycle
-            pylab.clf()
-        else:
-            colors = iter(colors)
-        plots = []
-        x = np.arange(self.X.shape[0])
-        for i in range(self.X.shape[1]):
-            if ax is None:
-                a = fig.add_subplot(self.X.shape[1], 1, i + 1)
-            elif isinstance(ax, (tuple, list)):
-                a = ax[i]
-            else:
-                raise ValueError("Need one ax per latent dimnesion input_dim")
-            a.plot(self.X, c='k', alpha=.3)
-            plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
-            a.fill_between(x,
-                            self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
-                            self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
-                            facecolor=plots[-1].get_color(),
-                            alpha=.3)
-            a.legend(borderaxespad=0.)
-            a.set_xlim(x.min(), x.max())
-            if i < self.X.shape[1] - 1:
-                a.set_xticklabels('')
-        pylab.draw()
-        if ax is None:
-            fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
-        return fig
-
-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return SparseGP._getstate(self) + [self.init]
-
-    def setstate(self, state):
-        self._const_jitter = None
-        self.init = state.pop()
-        SparseGP._setstate(self, state)
-
-
-def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    objective function for fitting the latent variables for test points
-    (negative log-likelihood: should be minimised!)
-    """
-    mu, log_S = mu_S.reshape(2, 1, -1)
-    S = np.exp(log_S)
-
-    psi0 = kern.psi0(Z, mu, S)
-    psi1 = kern.psi1(Z, mu, S)
-    psi2 = kern.psi2(Z, mu, S)
-
-    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
-
-    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
-    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
-    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
-
-    dmu = mu0 + mu1 + mu2 - mu
-    # dS = S0 + S1 + S2 -0.5 + .5/S
-    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
-    return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
-
-def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
-    This is the same as latent_cost_and_grad but only for the objective
-    """
-    mu, log_S = mu_S.reshape(2, 1, -1)
-    S = np.exp(log_S)
-
-    psi0 = kern.psi0(Z, mu, S)
-    psi1 = kern.psi1(Z, mu, S)
-    psi2 = kern.psi2(Z, mu, S)
-
-    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
-    return -float(lik)
-
-def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    This is the same as latent_cost_and_grad but only for the grad
-    """
-    mu, log_S = mu_S.reshape(2, 1, -1)
-    S = np.exp(log_S)
-
-    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
-    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
-    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
-
-    dmu = mu0 + mu1 + mu2 - mu
-    # dS = S0 + S1 + S2 -0.5 + .5/S
-    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
-
-    return -np.hstack((dmu.flatten(), dlnS.flatten()))
-
-
+    def __getstate__(self):
+        dc = super(SSGPLVM, self).__getstate__()
+        dc['mpi_comm'] = None
+        if self.mpi_comm != None:
+            del dc['N_range']
+            del dc['N_list']
+            del dc['Y_local']
+        return dc
+ 
+    def __setstate__(self, state):
+        return super(SSGPLVM, self).__setstate__(state)
+    
+    #=====================================================
+    # The MPI parallelization 
+    #     - can move to model at some point
+    #=====================================================
+    
+    def _set_params_transformed(self, p):
+        if self.mpi_comm != None:
+            if self.__IN_OPTIMIZATION__ and self.mpi_comm.rank==0:
+                self.mpi_comm.Bcast(np.int32(1),root=0)
+            self.mpi_comm.Bcast(p, root=0)
+        super(SSGPLVM, self)._set_params_transformed(p)
+        
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        self.__IN_OPTIMIZATION__ = True
+        if self.mpi_comm==None:
+            super(SSGPLVM, self).optimize(optimizer,start,**kwargs)
+        elif self.mpi_comm.rank==0:
+            super(SSGPLVM, self).optimize(optimizer,start,**kwargs)
+            self.mpi_comm.Bcast(np.int32(-1),root=0)
+        elif self.mpi_comm.rank>0:
+            x = self._get_params_transformed().copy()
+            flag = np.empty(1,dtype=np.int32)
+            while True:
+                self.mpi_comm.Bcast(flag,root=0)
+                if flag==1:
+                    self._set_params_transformed(x)
+                elif flag==-1:
+                    break
+                else:
+                    self.__IN_OPTIMIZATION__ = False
+                    raise Exception("Unrecognizable flag for synchronization!")
+        self.__IN_OPTIMIZATION__ = False