diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 00a80c7b..7677fea2 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -58,11 +58,33 @@ class SparseGP(GP):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y)
         self.likelihood.update_gradients(self.grad_dict.pop('partial_for_likelihood'))
         if isinstance(self.X, VariationalPosterior):
-            self.kern.update_gradients_variational(posterior_variational=self.X, Z=self.Z, **self.grad_dict)
-            self.Z.gradient = self.kern.gradients_Z_variational(posterior_variational=self.X, Z=self.Z, **self.grad_dict)
+            #gradients wrt kernel
+            dL_dKmm = self.grad_dict.pop('dL_dKmm')
+            self.kern.update_gradients_full(dL_dKmm, self.Z, None)
+            target = np.zeros(self.kern.size)
+            self.kern._collect_gradient(target)
+            self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, **self.grad_dict)
+            self.kern._collect_gradient(target)
+            self.kern._set_gradient(target)
+
+            #gradients wrt Z
+            self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
+            self.Z.gradient += self.kern.gradients_Z_expectations(
+                self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
         else:
-            self.kern.update_gradients_sparse(X=self.X, Z=self.Z, **self.grad_dict)
-            self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict)
+            #gradients wrt kernel
+            target = np.zeros(self.kern.size)
+            self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
+            self.kern._collect_gradient(target)
+            self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
+            self.kern._collect_gradient(target)
+            self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
+            self.kern._collect_gradient(target)
+            self.kern._set_gradient(target)
+
+            #gradients wrt Z
+            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
+            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
 
     def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False):
         """
diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 1022124d..3e52d5af 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -101,7 +101,7 @@ class Add(Kern):
             raise NotImplementedError, "psi2 cannot be computed for this kernel"
         return psi2
 
-    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z):
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         from white import White
         from rbf import RBF
         #from rbf_inv import RBFInv
@@ -124,10 +124,10 @@ class Add(Kern):
 
             eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
 
-            p1.update_gradients_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
+            p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
 
-    def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         from white import White
         from rbf import RBF
         #from rbf_inv import RBFInv
@@ -151,10 +151,10 @@ class Add(Kern):
 
             eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
 
-            target += p1.gradients_z_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
+            target += p1.gradients_Z_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
         return target
 
-    def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         from white import White
         from rbf import RBF
         #from rbf_inv import RBFInv
@@ -179,7 +179,7 @@ class Add(Kern):
 
             eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
 
-            a, b = p1.gradients_muS_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
+            a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
             target_mu += a
             target_S += b
         return target_mu, target_S
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 6b23a69e..2e412688 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -39,28 +39,21 @@ class Kern(Parameterized):
     def update_gradients_full(self, dL_dK, X, X2):
         """Set the gradients of all parameters when doing full (N) inference."""
         raise NotImplementedError
-    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
-        target = np.zeros(self.size)
-        self.update_gradients_diag(dL_dKdiag, X)
-        self._collect_gradient(target)
-        self.update_gradients_full(dL_dKnm, X, Z)
-        self._collect_gradient(target)
-        self.update_gradients_full(dL_dKmm, Z, None)
-        self._collect_gradient(target)
-        self._set_gradient(target)
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """
+        Set the gradients of all parameters when doing inference with
+        uncertain inputs, using expectations of the kernel.
+        """
+        raise NotImplementedError
+    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        raise NotImplementedError
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """
+        Compute the gradients wrt the parameters of the variational
+        distribution q(X), chain-ruling via the expectations of the kernel.
+        """
+        raise NotImplementedError
-    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs."""
-        raise NotImplementedError
-    def gradients_Z_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
-        grad = self.gradients_X(dL_dKmm, Z)
-        grad += self.gradients_X(dL_dKnm.T, Z, X)
-        return grad
-    def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        raise NotImplementedError
-    def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        raise NotImplementedError
-
     def plot_ARD(self, *args, **kw):
         if "matplotlib" in sys.modules:
             from ...plotting.matplot_dep import kernel_plots
@@ -68,13 +61,13 @@ class Kern(Parameterized):
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ...plotting.matplot_dep import kernel_plots
         return kernel_plots.plot_ARD(self,*args,**kw)
-    
+
     def input_sensitivity(self):
         """
         Returns the sensitivity for each dimension of this kernel.
         """
         return np.zeros(self.input_dim)
-    
+
     def __add__(self, other):
         """ Overloading of the '+' operator. for more control, see self.add """
         return self.add(other)
diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index 1d4f4611..e503180a 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -117,7 +117,7 @@ class Linear(Kern):
         ZAinner = self._ZAinner(variational_posterior, Z)
         return np.dot(ZAinner, ZA.T)
 
-    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z):
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         mu, S = variational_posterior.mean, variational_posterior.variance
         # psi0:
         tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior)
@@ -130,20 +130,15 @@ class Linear(Kern):
         tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * (2. * Z)[None, None, :, :])
         if self.ARD:
             grad += tmp.sum(0).sum(0).sum(0)
         else:
             grad += tmp.sum()
-        #from Kmm
-        self.update_gradients_full(dL_dKmm, Z, None)
-
         self.variances.gradient += grad
 
-    def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z):
-        # Kmm
-        grad = self.gradients_X(dL_dKmm, Z, None)
+    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         #psi1
-        grad += self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean)
+        grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean)
         #psi2
         self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad)
         return grad
 
-    def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z):
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape)
         # psi0
         grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances)
diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index c80fb646..7c43b18d 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -55,10 +55,7 @@ class RBF(Stationary):
         self._psi_computations(Z, mu, S)
         return self._psi2
 
-    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        #contributions from Kmm
-        sself.update_gradients_full(dL_dKmm, Z)
-
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
         self._psi_computations(Z, mu, S)
@@ -87,7 +84,7 @@ class RBF(Stationary):
         else:
             self.lengthscale.gradient += dpsi2_dlength.sum(0).sum(0).sum(0)
 
-    def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
         self._psi_computations(Z, mu, S)
@@ -104,11 +101,9 @@ class RBF(Stationary):
             dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
             grad += 2*(dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
 
-        grad += self.gradients_X(dL_dKmm, Z, None)
-
         return grad
 
-    def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
         self._psi_computations(Z, mu, S)
diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py
index f4400ed7..135e3f9e 100644
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@@ -25,10 +25,10 @@ class Static(Kern):
     def gradients_X_diag(self, dL_dKdiag, X):
         return np.zeros(X.shape)
 
-    def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         return np.zeros(Z.shape)
 
-    def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         return np.zeros(variational_posterior.shape), np.zeros(variational_posterior.shape)
 
     def psi0(self, Z, variational_posterior):
@@ -61,8 +61,8 @@
     def update_gradients_diag(self, dL_dKdiag, X):
         self.variance.gradient = dL_dKdiag.sum()
 
-    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        self.variance.gradient = np.trace(dL_dKmm) + dL_dpsi0.sum()
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0.sum()
 
 
 class Bias(Static):
@@ -86,6 +86,6 @@ class Bias(Static):
         ret[:] = self.variance**2
         return ret
 
-    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        self.variance.gradient = dL_dKmm.sum() + dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()
diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py
index b998969c..2d0d284a 100644
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@@ -312,4 +312,8 @@ class RatQuad(Stationary):
         grad = np.sum(dL_dK*dK_dpow)
         self.power.gradient = grad
 
+    def update_gradients_diag(self, dL_dKdiag, X):
+        super(RatQuad, self).update_gradients_diag(dL_dKdiag, X)
+        self.power.gradient = 0.
+
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 50fc2810..366995dc 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -66,7 +66,7 @@ class BayesianGPLVM(SparseGP):
         super(BayesianGPLVM, self).parameters_changed()
         self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
 
-        self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_q_variational(posterior_variational=self.X, Z=self.Z, **self.grad_dict)
+        self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, **self.grad_dict)
 
         # update for the KL divergence
         self.variational_prior.update_gradients_KL(self.X)
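
Reviewer note (not part of the patch): the kernels can drop their Kmm terms because SparseGP.parameters_changed now accumulates two separate passes, update_gradients_full(dL_dKmm, Z, None) and update_gradients_expectations(...), into one target vector via _collect_gradient/_set_gradient. The sketch below is a standalone NumPy illustration of that split for the White kernel, where dKmm/dvariance = I and dpsi0/dvariance = 1; the sizes N, M and the random arrays are made up for the example and are not taken from the patch.

import numpy as np

# Hypothetical sizes: N data points, M inducing points.
N, M = 5, 3
rng = np.random.RandomState(0)
dL_dKmm = rng.randn(M, M)   # gradient of the objective wrt Kmm
dL_dpsi0 = rng.randn(N)     # gradient of the objective wrt psi0

# Old behaviour: White.update_gradients_variational folded both terms into one call.
old_grad = np.trace(dL_dKmm) + dL_dpsi0.sum()

# New behaviour: two passes collected into a target vector of size kern.size
# (White has a single parameter, its variance), mirroring sparse_gp.py above.
target = np.zeros(1)
target[0] += np.trace(dL_dKmm)   # pass 1: update_gradients_full(dL_dKmm, Z, None); dKmm/dvariance = I
target[0] += dL_dpsi0.sum()      # pass 2: update_gradients_expectations(...); dpsi0/dvariance = 1
new_grad = target[0]

# The two-pass accumulation reproduces the old single-pass gradient.
assert np.isclose(old_grad, new_grad)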