From 080e4ebd60c8f0b9caaf163983c952f00bcc21f5 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Fri, 11 Dec 2015 10:46:46 +0000
Subject: [PATCH 01/11] add the import of transformation __fixed__

---
 GPy/core/parameterization/transformations.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index 1799a06d..936c7a64 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2014, Max Zwiessele, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-from paramz.transformations import *
\ No newline at end of file
+from paramz.transformations import *
+from paramz.transformations import __fixed__
\ No newline at end of file

From 60b2a57ea81fb72c7d8cfbe22693d0f6a14c1b5d Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Thu, 14 Jan 2016 09:41:59 +0000
Subject: [PATCH 02/11] implement slvm

---
 GPy/models/ss_gplvm.py | 85 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index 2c413ecd..c8ff1664 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -94,6 +94,86 @@ class IBPPrior(VariationalPrior):
         variational_posterior.tau.gradient[:,0] = -((tau[:,0]-gamma-ad)*polygamma(1,tau[:,0])+common)
         variational_posterior.tau.gradient[:,1] = -((tau[:,1]+gamma-2)*polygamma(1,tau[:,1])+common)
 
+class SLVMPosterior(SpikeAndSlabPosterior):
+    '''
+    The SpikeAndSlab distribution for variational approximations.
+    '''
+    def __init__(self, means, variances, binary_prob, tau=None,  name='latent space'):
+        """
+        binary_prob : the probability of the distribution on the slab part.
+        """
+        from paramz.transformations import Logexp
+        super(SLVMPosterior, self).__init__(means, variances, binary_prob, group_spike=False, name=name)
+        self.tau = Param("tau_", np.ones((self.gamma.shape[1],2)), Logexp())
+        self.link_parameter(self.tau)
+
+    def set_gradients(self, grad):
+        self.mean.gradient, self.variance.gradient, self.gamma.gradient, self.tau.gradient = grad
+
+    def __getitem__(self, s):
+        if isinstance(s, (int, slice, tuple, list, np.ndarray)):
+            import copy
+            n = self.__new__(self.__class__, self.name)
+            dc = self.__dict__.copy()
+            dc['mean'] = self.mean[s]
+            dc['variance'] = self.variance[s]
+            dc['binary_prob'] = self.binary_prob[s]
+            dc['tau'] = self.tau
+            dc['parameters'] = copy.copy(self.parameters)
+            n.__dict__.update(dc)
+            n.parameters[dc['mean']._parent_index_] = dc['mean']
+            n.parameters[dc['variance']._parent_index_] = dc['variance']
+            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
+            n.parameters[dc['tau']._parent_index_] = dc['tau']
+            n._gradient_array_ = None
+            oversize = self.size - self.mean.size - self.variance.size - self.gamma.size - self.tau.size
+            n.size = n.mean.size + n.variance.size + n.gamma.size+ n.tau.size + oversize
+            n.ndim = n.mean.ndim
+            n.shape = n.mean.shape
+            n.num_data = n.mean.shape[0]
+            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
+            return n
+        else:
+            return super(IBPPosterior, self).__getitem__(s)
+
+class SLVMPrior(VariationalPrior):
+    def __init__(self, input_dim, alpha =1., beta=1., Z=None, name='SLVMPrior', **kw):
+        super(SLVMPrior, self).__init__(name=name, **kw)
+        self.input_dim = input_dim
+        self.variance = 1.
+        self.alpha = alpha
+        self.beta = beta
+        self.Z = Z
+        if Z is not None:
+            assert np.all(np.unique(Z)==np.array([0,1]))
+
+    def KL_divergence(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma.values, variational_posterior.tau.values
+        
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        part1 = (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
+        
+        from scipy.special import betaln,digamma            
+        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + betaln(self.alpha,self.beta)*self.input_dim \
+                -betaln(tau[:,0], tau[:,1]).sum() + ((tau[:,0]-(gamma*self.Z).sum(0)-self.alpha)*digamma(tau[:,0])).sum() + \
+                ((tau[:,1]-((1-gamma)*self.Z).sum(0)-self.beta)*digamma(tau[:,1])).sum() + ((self.Z.sum(0)+self.alpha+self.beta-tau[:,0]-tau[:,1])*digamma(tau.sum(axis=1))).sum()        
+
+        return part1+part2
+
+    def update_gradients_KL(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma.values, variational_posterior.tau.values
+
+        variational_posterior.mean.gradient -= gamma*mu/self.variance
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2.
+        from scipy.special import digamma,polygamma
+        dgamma = np.log(gamma/(1.-gamma))+ (digamma(tau[:,1])-digamma(tau[:,0]))*self.Z
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        common = (self.Z.sum(0)+self.alpha+self.beta-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
+        variational_posterior.tau.gradient[:,0] = -((tau[:,0]-(gamma*self.Z).sum(0)-self.alpha)*polygamma(1,tau[:,0])+common)
+        variational_posterior.tau.gradient[:,1] = -((tau[:,1]-((1-gamma)*self.Z).sum(0)-self.beta)*polygamma(1,tau[:,1])+common)
+
+
 class SSGPLVM(SparseGP_MPI):
     """
     Spike-and-Slab Gaussian Process Latent Variable Model
@@ -107,7 +187,7 @@ class SSGPLVM(SparseGP_MPI):
 
     """
     def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False, alpha=2., tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None,**kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False,SLVM=False, alpha=2., beta=2., connM=None, tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None,**kwargs):
 
         self.group_spike = group_spike
         self.init = init
@@ -152,6 +232,9 @@ class SSGPLVM(SparseGP_MPI):
         if IBP:
             self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha) if variational_prior is None else variational_prior
             X = IBPPosterior(X, X_variance, gamma, tau=tau,sharedX=sharedX)
+        elif SLVM:
+            self.variational_prior = SLVMPrior(input_dim=input_dim, alpha=alpha, beta=beta, Z=connM) if variational_prior is None else variational_prior
+            X = SLVMPosterior(X, X_variance, gamma, tau=tau)
         else:
             self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi, group_spike=group_spike)  if variational_prior is None else variational_prior
             X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike,sharedX=sharedX)

From 5f417565fb900176fe275eaefc81f0f91bf06e59 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Thu, 14 Jan 2016 15:56:48 +0000
Subject: [PATCH 03/11] slvm gamma mean-field

---
 GPy/models/ss_gplvm.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index c8ff1664..4bc9a173 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -104,7 +104,7 @@ class SLVMPosterior(SpikeAndSlabPosterior):
         """
         from paramz.transformations import Logexp
         super(SLVMPosterior, self).__init__(means, variances, binary_prob, group_spike=False, name=name)
-        self.tau = Param("tau_", np.ones((self.gamma.shape[1],2)), Logexp())
+        self.tau = Param("tau_", np.ones((self.gamma.shape[1],2))*2, Logexp())
         self.link_parameter(self.tau)
 
     def set_gradients(self, grad):
@@ -152,7 +152,7 @@ class SLVMPrior(VariationalPrior):
         
         var_mean = np.square(mu)/self.variance
         var_S = (S/self.variance - np.log(S))
-        part1 = (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
+        part1 = ((np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
         
         from scipy.special import betaln,digamma            
         part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + betaln(self.alpha,self.beta)*self.input_dim \
@@ -164,11 +164,11 @@ class SLVMPrior(VariationalPrior):
     def update_gradients_KL(self, variational_posterior):
         mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma.values, variational_posterior.tau.values
 
-        variational_posterior.mean.gradient -= gamma*mu/self.variance
-        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2.
+        variational_posterior.mean.gradient -= mu/self.variance
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S)  /2.
         from scipy.special import digamma,polygamma
         dgamma = np.log(gamma/(1.-gamma))+ (digamma(tau[:,1])-digamma(tau[:,0]))*self.Z
-        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        variational_posterior.binary_prob.gradient -= dgamma
         common = (self.Z.sum(0)+self.alpha+self.beta-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
         variational_posterior.tau.gradient[:,0] = -((tau[:,0]-(gamma*self.Z).sum(0)-self.alpha)*polygamma(1,tau[:,0])+common)
         variational_posterior.tau.gradient[:,1] = -((tau[:,1]-((1-gamma)*self.Z).sum(0)-self.beta)*polygamma(1,tau[:,1])+common)

From 8b279175c5894b02c11f694a35e57490ae69acfa Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Thu, 21 Jan 2016 11:22:57 +0000
Subject: [PATCH 04/11] fix the issue of negative prediction variance of normal
 GP

---
 GPy/core/gp.py                                |  6 +++
 GPy/core/sparse_gp.py                         |  7 +++-
 .../exact_gaussian_inference.py               |  3 +-
 .../latent_function_inference/posterior.py    | 34 ++++++++++++++++-
 GPy/testing/inference_tests.py                |  1 -
 GPy/testing/model_tests.py                    | 38 +++++++++++++++++++
 6 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index ae710355..ac08677d 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -212,6 +212,12 @@ class GP(Model):
                         = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
             \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
         """
+        if hasattr(self.posterior, '_raw_predict'):
+            mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
+            return mu, var
+            
         if kern is None:
             kern = self.kern
 
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index d71eecc3..a0f50ce9 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -126,7 +126,12 @@ class SparseGP(GP):
         The implementation of that will follow. However, for each dimension the
         covariance changes, so if full_cov is False (standard), we return the variance
         for each dimension [NxD].
-        """
+        """        
+        if hasattr(self.posterior, '_raw_predict'):
+            mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
+            return mu, var
 
         if kern is None: kern = self.kern
 
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 0ab85586..74f66fe6 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -1,14 +1,13 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-from .posterior import Posterior
+from .posterior import PosteriorExact as Posterior
 from ...util.linalg import pdinv, dpotrs, tdot
 from ...util import diag
 import numpy as np
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
 
-
 class ExactGaussianInference(LatentFunctionInference):
     """
     An object for inference when the likelihood is Gaussian.
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index fbd72f57..c9b8f492 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol
+from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol, dtrtrs, tdot
 
 class Posterior(object):
     """
@@ -187,3 +187,35 @@ class Posterior(object):
         if self._K_chol is None:
             self._K_chol = jitchol(self._K)
         return self._K_chol
+    
+class PosteriorExact(Posterior):
+     
+    def _raw_predict(self, kern, Xnew, pred_var, full_cov=False):
+        
+        Kx = kern.K(pred_var, Xnew)
+        mu = np.dot(Kx.T, self.woodbury_vector)
+        if len(mu.shape)==1:
+            mu = mu.reshape(-1,1)
+        if full_cov:
+            Kxx = kern.K(Xnew)
+            if self._woodbury_chol.ndim == 2:
+                tmp = dtrtrs(self._woodbury_chol, Kx)[0]
+                var = Kxx - tdot(tmp.T)
+            elif self._woodbury_chol.ndim == 3: # Missing data
+                var = np.empty((Kxx.shape[0],Kxx.shape[1],self._woodbury_chol.shape[2]))
+                for i in range(var.shape[2]):
+                    tmp = dtrtrs(self._woodbury_chol[:,:,i], Kx)[0]
+                    var[:, :, i] = (Kxx - tdot(tmp.T))
+            var = var
+        else:
+            Kxx = kern.Kdiag(Xnew)
+            if self._woodbury_chol.ndim == 2:
+                tmp = dtrtrs(self._woodbury_chol, Kx)[0]
+                var = (Kxx - np.square(tmp).sum(0))[:,None]
+            elif self._woodbury_chol.ndim == 3: # Missing data
+                var = np.empty((Kxx.shape[0],self._woodbury_chol.shape[2]))
+                for i in range(var.shape[1]):
+                    tmp = dtrtrs(self._woodbury_chol[:,:,i], Kx)[0]
+                    var[:, i] = (Kxx - np.square(tmp).sum(0))
+            var = var
+        return mu, var
diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py
index 7a091589..ebda08c6 100644
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@@ -50,6 +50,5 @@ class InferenceXTestCase(unittest.TestCase):
         x, mi = m.infer_newX(m.Y, optimize=True)
         np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
 
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 1212d746..009b4848 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -22,6 +22,44 @@ class MiscTests(unittest.TestCase):
         self.assertTrue(m.checkgrad())
         m.predict(m.X)
 
+    def test_raw_predict_numerical_stability(self):
+        """
+        Test whether the predicted variance of normal GP goes negative under numerical unstable situation.
+        Thanks simbartonels@github for reporting the bug and providing the following example.
+        """
+        
+        # set seed for reproducability
+        np.random.seed(3)
+        # Definition of the Branin test function
+        def branin(X):
+            y = (X[:,1]-5.1/(4*np.pi**2)*X[:,0]**2+5*X[:,0]/np.pi-6)**2
+            y += 10*(1-1/(8*np.pi))*np.cos(X[:,0])+10
+            return(y)
+        # Training set defined as a 5*5 grid:
+        xg1 = np.linspace(-5,10,5)
+        xg2 = np.linspace(0,15,5)
+        X = np.zeros((xg1.size * xg2.size,2))
+        for i,x1 in enumerate(xg1):
+            for j,x2 in enumerate(xg2):
+                X[i+xg1.size*j,:] = [x1,x2]
+        Y = branin(X)[:,None]
+        # Fit a GP
+        # Create an exponentiated quadratic plus bias covariance function
+        k = GPy.kern.RBF(input_dim=2, ARD = True)
+        # Build a GP model
+        m = GPy.models.GPRegression(X,Y,k)
+        # fix the noise variance
+        m.likelihood.variance.fix(1e-5)
+        # Randomize the model and optimize
+        m.randomize()
+        m.optimize()
+        # Compute the mean of model prediction on 1e5 Monte Carlo samples
+        Xp = np.random.uniform(size=(1e5,2))
+        Xp[:,0] = Xp[:,0]*15-5
+        Xp[:,1] = Xp[:,1]*15
+        _, var = m.predict(Xp)
+        self.assertTrue(np.all(var>=0.))
+
     def test_raw_predict(self):
         k = GPy.kern.RBF(1)
         m = GPy.models.GPRegression(self.X, self.Y, kernel=k)

From 0dd52981d0398a4e325b733611510e54818698ca Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 9 Feb 2016 16:48:23 +0000
Subject: [PATCH 05/11] get rid of mpi4py import

---
 GPy/util/gpu_init.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/GPy/util/gpu_init.py b/GPy/util/gpu_init.py
index 0c496db3..94763d8b 100644
--- a/GPy/util/gpu_init.py
+++ b/GPy/util/gpu_init.py
@@ -10,15 +10,12 @@ gpu_device = None
 gpu_context = None
 MPI_enabled = False
 
-try:
-    from mpi4py import MPI
-    MPI_enabled = True
-except:
-    pass
-
-
-
 def initGPU():
+    try:
+        from mpi4py import MPI
+        MPI_enabled = True
+    except:
+        pass
     try:
         if MPI_enabled and MPI.COMM_WORLD.size>1:
             from .parallel import get_id_within_node

From a77a6755495fe3871ca9aa53dd107554f26cc01b Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 9 Feb 2016 17:22:38 +0000
Subject: [PATCH 06/11] fix gpu initialziation

---
 GPy/kern/src/psi_comp/rbf_psi_gpucomp.py   |  3 --
 GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py |  2 -
 GPy/util/gpu_init.py                       | 52 ++++++++++++----------
 3 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
index baab83ec..e9268a3e 100644
--- a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
+++ b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
@@ -5,7 +5,6 @@ The module for psi-statistics for RBF kernel
 import numpy as np
 from paramz.caching import Cache_this
 from . import PSICOMP_RBF
-from ....util import gpu_init
 
 gpu_code = """
     // define THREADNUM
@@ -238,8 +237,6 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         self.fall_back = PSICOMP_RBF()
         
         from pycuda.compiler import SourceModule
-        from ....util.gpu_init import initGPU
-        initGPU()
         
         self.GPU_direct = GPU_direct
         self.gpuCache = None
diff --git a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
index 844f944e..62876785 100644
--- a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
@@ -287,8 +287,6 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
     def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
         
         from pycuda.compiler import SourceModule
-        from ....util.gpu_init import initGPU
-        initGPU()
         
         self.GPU_direct = GPU_direct
         self.gpuCache = None
diff --git a/GPy/util/gpu_init.py b/GPy/util/gpu_init.py
index 94763d8b..19339b91 100644
--- a/GPy/util/gpu_init.py
+++ b/GPy/util/gpu_init.py
@@ -10,29 +10,35 @@ gpu_device = None
 gpu_context = None
 MPI_enabled = False
 
-def initGPU():
-    try:
-        from mpi4py import MPI
-        MPI_enabled = True
-    except:
-        pass
-    try:
-        if MPI_enabled and MPI.COMM_WORLD.size>1:
-            from .parallel import get_id_within_node
-            gpuid = get_id_within_node()
-            import pycuda.driver
-            pycuda.driver.init()
-            if gpuid>=pycuda.driver.Device.count():
-                print('['+MPI.Get_processor_name()+'] more processes than the GPU numbers!')
-                raise
-            gpu_device = pycuda.driver.Device(gpuid)
-            gpu_context = gpu_device.make_context()
-            gpu_initialized = True
-        else:
-            import pycuda.autoinit
-            gpu_initialized = True
-    except:
-        pass
+try:
+    import pycuda.autoinit
+    gpu_initialized = True
+except:
+    pass
+
+# def initGPU():
+#     try:
+#         from mpi4py import MPI
+#         MPI_enabled = True
+#     except:
+#         pass
+#     try:
+#         if MPI_enabled and MPI.COMM_WORLD.size>1:
+#             from .parallel import get_id_within_node
+#             gpuid = get_id_within_node()
+#             import pycuda.driver
+#             pycuda.driver.init()
+#             if gpuid>=pycuda.driver.Device.count():
+#                 print('['+MPI.Get_processor_name()+'] more processes than the GPU numbers!')
+#                 raise
+#             gpu_device = pycuda.driver.Device(gpuid)
+#             gpu_context = gpu_device.make_context()
+#             gpu_initialized = True
+#         else:
+#             import pycuda.autoinit
+#             gpu_initialized = True
+#     except:
+#         pass
     
 def closeGPU():
     if gpu_context is not None:

From 3917d9cdbafb4c3a793a27709fb6dab94e6ed2e4 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 9 Feb 2016 17:25:25 +0000
Subject: [PATCH 07/11] fix gpu initialziation

---
 GPy/kern/src/psi_comp/rbf_psi_gpucomp.py   | 1 +
 GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
index e9268a3e..4eda1b32 100644
--- a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
+++ b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
@@ -237,6 +237,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         self.fall_back = PSICOMP_RBF()
         
         from pycuda.compiler import SourceModule
+        import GPy.util.gpu_init
         
         self.GPU_direct = GPU_direct
         self.gpuCache = None
diff --git a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
index 62876785..04063dcc 100644
--- a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
@@ -7,7 +7,6 @@ import numpy as np
 from paramz.caching import Cache_this
 from . import PSICOMP_RBF
 
-
 gpu_code = """
     // define THREADNUM
 
@@ -287,6 +286,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
     def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
         
         from pycuda.compiler import SourceModule
+        import GPy.util.gpu_init
         
         self.GPU_direct = GPU_direct
         self.gpuCache = None

From 23b929c7e46a95c2e7faaad82ed7b1921ae4edd5 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 22 Feb 2016 20:56:39 +0000
Subject: [PATCH 08/11] fallback original slvm kl divergence

---
 GPy/models/ss_gplvm.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index 4bc9a173..c8ff1664 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -104,7 +104,7 @@ class SLVMPosterior(SpikeAndSlabPosterior):
         """
         from paramz.transformations import Logexp
         super(SLVMPosterior, self).__init__(means, variances, binary_prob, group_spike=False, name=name)
-        self.tau = Param("tau_", np.ones((self.gamma.shape[1],2))*2, Logexp())
+        self.tau = Param("tau_", np.ones((self.gamma.shape[1],2)), Logexp())
         self.link_parameter(self.tau)
 
     def set_gradients(self, grad):
@@ -152,7 +152,7 @@ class SLVMPrior(VariationalPrior):
         
         var_mean = np.square(mu)/self.variance
         var_S = (S/self.variance - np.log(S))
-        part1 = ((np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
+        part1 = (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
         
         from scipy.special import betaln,digamma            
         part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + betaln(self.alpha,self.beta)*self.input_dim \
@@ -164,11 +164,11 @@ class SLVMPrior(VariationalPrior):
     def update_gradients_KL(self, variational_posterior):
         mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma.values, variational_posterior.tau.values
 
-        variational_posterior.mean.gradient -= mu/self.variance
-        variational_posterior.variance.gradient -= (1./self.variance - 1./S)  /2.
+        variational_posterior.mean.gradient -= gamma*mu/self.variance
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2.
         from scipy.special import digamma,polygamma
         dgamma = np.log(gamma/(1.-gamma))+ (digamma(tau[:,1])-digamma(tau[:,0]))*self.Z
-        variational_posterior.binary_prob.gradient -= dgamma
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
         common = (self.Z.sum(0)+self.alpha+self.beta-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
         variational_posterior.tau.gradient[:,0] = -((tau[:,0]-(gamma*self.Z).sum(0)-self.alpha)*polygamma(1,tau[:,0])+common)
         variational_posterior.tau.gradient[:,1] = -((tau[:,1]-((1-gamma)*self.Z).sum(0)-self.beta)*polygamma(1,tau[:,1])+common)

From e9cc56e8e8720e895797b9a20f5ea3a30177da9f Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Thu, 10 Mar 2016 18:14:25 +0000
Subject: [PATCH 09/11] add ssgplvm model test

---
 GPy/testing/model_tests.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 009b4848..8ff06e65 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -704,7 +704,19 @@ class GradientTests(np.testing.TestCase):
         lik = GPy.likelihoods.Gaussian()
         m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
         self.assertTrue(m.checkgrad())
-
+        
+    def test_ssgplvm(self):
+        from GPy import kern
+        from GPy.models import SSGPLVM
+        from GPy.examples.dimensionality_reduction import _simulate_matern
+    
+        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
+        _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
+        Y = Ylist[0]
+        k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+        # k = kern.RBF(Q, ARD=True, lengthscale=10.)
+        m = SSGPLVM(Y, Q, init="rand", num_inducing=num_inducing, kernel=k, group_spike=True)
+        self.assertTrue(m.checkgrad())
 
 if __name__ == "__main__":
     print("Running unit tests, please be (very) patient...")

From f2b813551a7dcde7b522d3486d9bef4219039015 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Thu, 10 Mar 2016 18:37:53 +0000
Subject: [PATCH 10/11] bug fix for mcmc sampler and add test case

---
 GPy/inference/mcmc/samplers.py |  8 +++-----
 GPy/testing/inference_tests.py | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/GPy/inference/mcmc/samplers.py b/GPy/inference/mcmc/samplers.py
index 5ec684cc..d2f11a49 100644
--- a/GPy/inference/mcmc/samplers.py
+++ b/GPy/inference/mcmc/samplers.py
@@ -37,16 +37,14 @@ class Metropolis_Hastings(object):
 
     def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400):
         current = self.model.optimizer_array
-        fcurrent = self.model.log_likelihood() + self.model.log_prior() + \
-                   self.model._log_det_jacobian()
+        fcurrent = self.model.log_likelihood() + self.model.log_prior() 
         accepted = np.zeros(Ntotal,dtype=np.bool)
         for it in range(Ntotal):
-            print("sample %d of %d\r"%(it,Ntotal),end="\t")
+            print("sample %d of %d\r"%(it+1,Ntotal),end="")
             sys.stdout.flush()
             prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
             self.model.optimizer_array = prop
-            fprop = self.model.log_likelihood() + self.model.log_prior() + \
-                    self.model._log_det_jacobian()
+            fprop = self.model.log_likelihood() + self.model.log_prior() 
 
             if fprop>fcurrent:#sample accepted, going 'uphill'
                 accepted[it] = True
diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py
index 33ec3258..4bd2bc4f 100644
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@@ -64,6 +64,21 @@ class HMCSamplerTest(unittest.TestCase):
 
         hmc = GPy.inference.mcmc.HMC(m,stepsize=1e-2)
         s = hmc.sample(num_samples=3)
+        
+class MCMCSamplerTest(unittest.TestCase):
+
+    def test_sampling(self):
+        np.random.seed(1)
+        x = np.linspace(0.,2*np.pi,100)[:,None]
+        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
+
+        m = GPy.models.GPRegression(x,y)
+        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
+        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
+        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
+
+        mcmc = GPy.inference.mcmc.Metropolis_Hastings(m)
+        mcmc.sample(Ntotal=100, Nburn=10)
 
 if __name__ == "__main__":
     unittest.main()

From 67ba9b60c668bb2018672bb590f5509719b6eba1 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Thu, 17 Mar 2016 11:29:33 +0000
Subject: [PATCH 11/11] move _raw_predict into posterior object

---
 GPy/core/gp.py                                | 35 +-------
 GPy/core/sparse_gp.py                         | 87 -------------------
 .../latent_function_inference/posterior.py    | 51 +++++++++++
 GPy/testing/model_tests.py                    |  1 +
 4 files changed, 53 insertions(+), 121 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 0b87b1d5..d9db66ef 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -212,42 +212,9 @@ class GP(Model):
                         = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
             \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
         """
-        if hasattr(self.posterior, '_raw_predict'):
-            mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
-            if self.mean_function is not None:
-                mu += self.mean_function.f(Xnew)
-            return mu, var
-            
-        if kern is None:
-            kern = self.kern
-
-        Kx = kern.K(self._predictive_variable, Xnew)
-        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-        if len(mu.shape)==1:
-            mu = mu.reshape(-1,1)
-        if full_cov:
-            Kxx = kern.K(Xnew)
-            if self.posterior.woodbury_inv.ndim == 2:
-                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-            elif self.posterior.woodbury_inv.ndim == 3: # Missing data
-                var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
-                from ..util.linalg import mdot
-                for i in range(var.shape[2]):
-                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
-            var = var
-        else:
-            Kxx = kern.Kdiag(Xnew)
-            if self.posterior.woodbury_inv.ndim == 2:
-                var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
-            elif self.posterior.woodbury_inv.ndim == 3: # Missing data
-                var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
-                for i in range(var.shape[1]):
-                    var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
-            var = var
-        #add in the mean function
+        mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
         if self.mean_function is not None:
             mu += self.mean_function.f(Xnew)
-
         return mu, var
 
     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, likelihood=None):
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 488b7026..7c0c1d18 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -113,90 +113,3 @@ class SparseGP(GP):
             self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
         self._Zgrad = self.Z.gradient.copy()
 
-
-    def _raw_predict(self, Xnew, full_cov=False, kern=None):
-        """
-        Make a prediction for the latent function values.
-
-        For certain inputs we give back a full_cov of shape NxN,
-        if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
-        we take only the diagonal elements across N.
-
-        For uncertain inputs, the SparseGP bound produces cannot predict the full covariance matrix full_cov for now.
-        The implementation of that will follow. However, for each dimension the
-        covariance changes, so if full_cov is False (standard), we return the variance
-        for each dimension [NxD].
-        """        
-        if hasattr(self.posterior, '_raw_predict'):
-            mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
-            if self.mean_function is not None:
-                mu += self.mean_function.f(Xnew)
-            return mu, var
-
-        if kern is None: kern = self.kern
-
-        if not isinstance(Xnew, VariationalPosterior):
-            # Kx = kern.K(self._predictive_variable, Xnew)
-            # mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-            # if full_cov:
-            #     Kxx = kern.K(Xnew)
-            #     if self.posterior.woodbury_inv.ndim == 2:
-            #         var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-            #     elif self.posterior.woodbury_inv.ndim == 3:
-            #         var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
-            #         for i in range(var.shape[2]):
-            #             var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
-            #     var = var
-            # else:
-            #     Kxx = kern.Kdiag(Xnew)
-            #     if self.posterior.woodbury_inv.ndim == 2:
-            #         var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
-            #     elif self.posterior.woodbury_inv.ndim == 3:
-            #         var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
-            #         for i in range(var.shape[1]):
-            #             var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
-            #     var = var
-            # #add in the mean function
-            # if self.mean_function is not None:
-            #     mu += self.mean_function.f(Xnew)
-            mu, var = super(SparseGP, self)._raw_predict(Xnew, full_cov, kern)
-        else:
-            psi0_star = kern.psi0(self._predictive_variable, Xnew)
-            psi1_star = kern.psi1(self._predictive_variable, Xnew)
-            psi2_star = kern.psi2n(self._predictive_variable, Xnew) 
-            la = self.posterior.woodbury_vector
-            mu = np.dot(psi1_star, la) # TODO: dimensions?
-            N,M,D = psi0_star.shape[0],psi1_star.shape[1], la.shape[1]
-
-            if full_cov:
-                raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
-                var = np.zeros((Xnew.shape[0], la.shape[1], la.shape[1]))
-                di = np.diag_indices(la.shape[1])
-            else:
-                tmp = psi2_star - psi1_star[:,:,None]*psi1_star[:,None,:]
-                var = (tmp.reshape(-1,M).dot(la).reshape(N,M,D)*la[None,:,:]).sum(1) + psi0_star[:,None] 
-                if self.posterior.woodbury_inv.ndim==2:
-                    var += -psi2_star.reshape(N,-1).dot(self.posterior.woodbury_inv.flat)[:,None]
-                else:
-                    var += -psi2_star.reshape(N,-1).dot(self.posterior.woodbury_inv.reshape(-1,D))
-            assert np.all(var>=-1e-5), "The predicted variance goes negative!: "+str(var)
-            var = np.clip(var,1e-15,np.inf)
-
-#             for i in range(Xnew.shape[0]):
-#                 _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
-#                 psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var))
-#                 tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
-# 
-#                 var_ = mdot(la.T, tmp, la)
-#                 p0 = psi0_star[i]
-#                 t = np.atleast_3d(self.posterior.woodbury_inv)
-#                 t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
-# 
-#                 if full_cov:
-#                     var_[di] += p0
-#                     var_[di] += -t2
-#                     var[i] = var_
-#                 else:
-#                     var[i] = np.diag(var_)+p0-t2
-
-        return mu, var
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index c9b8f492..73b9dff0 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol, dtrtrs, tdot
+from GPy.core.parameterization.variational import VariationalPosterior
 
 class Posterior(object):
     """
@@ -187,6 +188,56 @@ class Posterior(object):
         if self._K_chol is None:
             self._K_chol = jitchol(self._K)
         return self._K_chol
+
+    def _raw_predict(self, kern, Xnew, pred_var, full_cov=False):
+        woodbury_vector = self.woodbury_vector
+        woodbury_inv = self.woodbury_inv
+        
+        if not isinstance(Xnew, VariationalPosterior):
+            Kx = kern.K(pred_var, Xnew)
+            mu = np.dot(Kx.T, woodbury_vector)
+            if len(mu.shape)==1:
+                mu = mu.reshape(-1,1)
+            if full_cov:
+                Kxx = kern.K(Xnew)
+                if woodbury_inv.ndim == 2:
+                    var = Kxx - np.dot(Kx.T, np.dot(woodbury_inv, Kx))
+                elif woodbury_inv.ndim == 3: # Missing data
+                    var = np.empty((Kxx.shape[0],Kxx.shape[1],woodbury_inv.shape[2]))
+                    from ...util.linalg import mdot
+                    for i in range(var.shape[2]):
+                        var[:, :, i] = (Kxx - mdot(Kx.T, woodbury_inv[:, :, i], Kx))
+                var = var
+            else:
+                Kxx = kern.Kdiag(Xnew)
+                if woodbury_inv.ndim == 2:
+                    var = (Kxx - np.sum(np.dot(woodbury_inv.T, Kx) * Kx, 0))[:,None]
+                elif woodbury_inv.ndim == 3: # Missing data
+                    var = np.empty((Kxx.shape[0],woodbury_inv.shape[2]))
+                    for i in range(var.shape[1]):
+                        var[:, i] = (Kxx - (np.sum(np.dot(woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+                var = var
+        else:
+            psi0_star = kern.psi0(pred_var, Xnew)
+            psi1_star = kern.psi1(pred_var, Xnew)
+            psi2_star = kern.psi2n(pred_var, Xnew) 
+            la = woodbury_vector
+            mu = np.dot(psi1_star, la) # TODO: dimensions?
+            N,M,D = psi0_star.shape[0],psi1_star.shape[1], la.shape[1]
+
+            if full_cov:
+                raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
+                var = np.zeros((Xnew.shape[0], la.shape[1], la.shape[1]))
+                di = np.diag_indices(la.shape[1])
+            else:
+                tmp = psi2_star - psi1_star[:,:,None]*psi1_star[:,None,:]
+                var = (tmp.reshape(-1,M).dot(la).reshape(N,M,D)*la[None,:,:]).sum(1) + psi0_star[:,None] 
+                if woodbury_inv.ndim==2:
+                    var += -psi2_star.reshape(N,-1).dot(woodbury_inv.flat)[:,None]
+                else:
+                    var += -psi2_star.reshape(N,-1).dot(woodbury_inv.reshape(-1,D))
+        var = np.clip(var,1e-15,np.inf)
+        return mu, var
     
 class PosteriorExact(Posterior):
      
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 55e5309c..e83fb993 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -157,6 +157,7 @@ class MiscTests(unittest.TestCase):
         # Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
         Kinv = m.posterior.woodbury_inv
         K_hat = k.K(self.X_new) - k.K(self.X_new, Z).dot(Kinv).dot(k.K(Z, self.X_new))
+        K_hat = np.clip(K_hat, 1e-15, np.inf)
 
         mu, covar = m._raw_predict(self.X_new, full_cov=True)
         self.assertEquals(mu.shape, (self.N_new, self.D))