diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 7677fea2..91d1f205 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -70,7 +70,7 @@ class SparseGP(GP):
             #gradients wrt Z
             self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
             self.Z.gradient += self.kern.gradients_Z_expectations(
-                               self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpis2'], Z=self.Z, variational_posterior=self.X)
+                               self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
         else:
             #gradients wrt kernel
             target = np.zeros(self.kern.size)
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index a2686d73..eab9a965 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -274,6 +274,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
     Y = Ylist[0]
     k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    #k = kern.RBF(Q, ARD=True, lengthscale=10.)
     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)

     if optimize:
@@ -281,7 +282,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
         m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)

     if plot:
-        m.q.plot("BGPLVM Latent Space 1D")
+        m.X.plot("BGPLVM Latent Space 1D")
         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m

@@ -302,7 +303,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
     m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k)
     m.inference_method = VarDTCMissingData()
     m.Y[inan] = _np.nan
-    m.q.variance *= .1
+    m.X.variance *= .1
     m.parameters_changed()
     m.Yreal = Y

@@ -311,7 +312,7 @@
         m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)

     if plot:
-        m.q.plot("BGPLVM Latent Space 1D")
+        m.X.plot("BGPLVM Latent Space 1D")
         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m

diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index e503180a..44f71d85 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -22,22 +22,25 @@ class Linear(Kern):
     :param input_dim: the number of input dimensions
     :type input_dim: int
     :param variances: the vector of variances :math:`\sigma^2_i`
-    :type variances: array or list of the appropriate size (or float if there is only one variance parameter)
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension.
+    :type variances: array or list of the appropriate size (or float if there
+                     is only one variance parameter)
+    :param ARD: Auto Relevance Determination. If False, the kernel has only one
+                variance parameter \sigma^2, otherwise there is one variance
+                parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
+
     """

     def __init__(self, input_dim, variances=None, ARD=False, name='linear'):
         super(Linear, self).__init__(input_dim, name)
         self.ARD = ARD
-        if ARD == False:
+        if not ARD:
             if variances is not None:
                 variances = np.asarray(variances)
                 assert variances.size == 1, "Only one variance needed for non-ARD kernel"
             else:
                 variances = np.ones(1)
-            self._Xcache, self._X2cache = np.empty(shape=(2,))
         else:
             if variances is not None:
                 variances = np.asarray(variances)
@@ -103,7 +106,6 @@ class Linear(Kern):

     #---------------------------------------#
     #             PSI statistics            #
-    #             variational               #
     #---------------------------------------#

     def psi0(self, Z, variational_posterior):
@@ -118,18 +120,16 @@ class Linear(Kern):
         return np.dot(ZAinner, ZA.T)

     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        mu, S = variational_posterior.mean, variational_posterior.variance
+        #psi1
+        self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z)
         # psi0:
         tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior)
-        if self.ARD: grad = tmp.sum(0)
-        else: grad = np.atleast_1d(tmp.sum())
-        #psi1
-        self.update_gradients_full(dL_dpsi1, mu, Z)
-        grad += self.variances.gradient
+        if self.ARD: self.variances.gradient += tmp.sum(0)
+        else: self.variances.gradient += tmp.sum()
         #psi2
         tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * (2. * Z)[None, None, :, :])
-        if self.ARD: grad += tmp.sum(0).sum(0).sum(0)
-        else: grad += tmp.sum()
+        if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0)
+        else: self.variances.gradient += tmp.sum()

     def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         #psi1
@@ -155,7 +155,7 @@ class Linear(Kern):

     #--------------------------------------------------#

-    def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, pv, target_mu, target_S):
+    def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, vp, target_mu, target_S):
         # Think N,num_inducing,num_inducing,input_dim
         ZA = Z * self.variances
         AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :]
@@ -198,16 +198,15 @@ class Linear(Kern):
         weave_options = {'headers'           : ['<omp.h>'],
                          'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
                          'extra_link_args'   : ['-lgomp']}
-
-        mu = pv.mean
+        mu = vp.mean
         N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu)

         weave.inline(code, support_code=support_code, libraries=['gomp'],
                      arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
                      type_converters=weave.converters.blitz,**weave_options)

-    def _weave_dpsi2_dZ(self, dL_dpsi2, Z, pv, target):
-        AZA = self.variances*self._ZAinner(pv, Z)
+    def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target):
+        AZA = self.variances*self._ZAinner(vp, Z)
         code="""
         int n,m,mm,q;
         #pragma omp parallel for private(n,mm,q)
@@ -229,23 +228,23 @@ class Linear(Kern):
                          'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
                          'extra_link_args'   : ['-lgomp']}

-        N,num_inducing,input_dim = pv.mean.shape[0],Z.shape[0],pv.mean.shape[1]
-        mu = param_to_array(pv.mean)
+        N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1]
+        mu = param_to_array(vp.mean)
         weave.inline(code, support_code=support_code, libraries=['gomp'],
                      arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
                      type_converters=weave.converters.blitz,**weave_options)

-    def _mu2S(self, pv):
-        return np.square(pv.mean) + pv.variance
+    def _mu2S(self, vp):
+        return np.square(vp.mean) + vp.variance

-    def _ZAinner(self, pv, Z):
+    def _ZAinner(self, vp, Z):
         ZA = Z*self.variances
-        inner = (pv.mean[:, None, :] * pv.mean[:, :, None])
-        diag_indices = np.diag_indices(pv.mean.shape[1], 2)
-        inner[:, diag_indices[0], diag_indices[1]] += pv.variance
+        inner = (vp.mean[:, None, :] * vp.mean[:, :, None])
+        diag_indices = np.diag_indices(vp.mean.shape[1], 2)
+        inner[:, diag_indices[0], diag_indices[1]] += vp.variance

-        return np.dot(ZA, inner).swapaxes(0, 1)  # NOTE: self.ZAinner \in [num_inducing x N x input_dim]!
+        return np.dot(ZA, inner).swapaxes(0, 1)  # NOTE: self.ZAinner \in [num_inducing x num_data x input_dim]!

     def input_sensitivity(self):
         if self.ARD: return self.variances
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 366995dc..5fed767b 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -66,7 +66,7 @@ class BayesianGPLVM(SparseGP):
         super(BayesianGPLVM, self).parameters_changed()
         self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)

-        self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_q_variational(variational_posterior=self.X, Z=self.Z, **self.grad_dict)
+        self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, **self.grad_dict)

         # update for the KL divergence
         self.variational_prior.update_gradients_KL(self.X)
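
Note on usage, since the patch touches two outward-facing names: the variational posterior over the latent space is now reached as m.X rather than m.q, and kernels expose psi-statistic gradients w.r.t. q(X) through gradients_qX_expectations (formerly gradients_q_variational). A minimal sketch, assuming a GPy development checkout that this patch applies to and that bgplvm_simulation exposes the optimize/plot keywords used in its body:

    from GPy.examples.dimensionality_reduction import bgplvm_simulation

    # Build the simulated BGPLVM model without optimizing or plotting.
    m = bgplvm_simulation(optimize=False, plot=False)

    # The variational posterior q(X) now lives on m.X (formerly m.q).
    print(m.X.mean.shape, m.X.variance.shape)

    # Gradients of the psi-statistic expectations w.r.t. q(X), through the
    # renamed kernel hook; m.grad_dict is filled in by the sparse-GP
    # inference step. This mirrors the call in bayesian_gplvm.py above.
    dmu, dS = m.kern.gradients_qX_expectations(
        variational_posterior=m.X, Z=m.Z, **m.grad_dict)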