diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 3152d4b5..ab1f3bf0 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -53,26 +53,45 @@ class SparseGP(GP):
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
 
         self.Z = Param('inducing inputs', self.Z)
-        self.add_parameter(self.Z, gradient=self.dL_dZ, index=0)
-        self.add_parameter(self.kern, gradient=self.dL_dtheta)
-        self.add_parameter(self.likelihood, gradient=lambda:self.likelihood._gradients(partial=self.partial_for_likelihood))
+        self.add_parameter(self.Z, index=0)
+        self.add_parameter(self.kern)
+        self.add_parameter(self.likelihood)
 
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
 
-                #The derivative of the bound wrt the inducing inputs Z
-        self.Z.gradient = self.kern.dK_dX(self.dL_dKmm, self.Z)
-        if self.has_uncertain_inputs:
+        #The derivative of the bound wrt the inducing inputs Z
+        self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
+        if self.X_variance is None:
+            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
+        else:
             self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
             self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
-        else:
-            self.Z.gradient += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
 
     def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
         """
         Make a prediction for the latent function values
         """
-        #TODO!!!
+        if X_variance_new is None:
+            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
+            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
+            if full_cov:
+                Kxx = self.kern.K(Xnew, which_parts=which_parts)
+                var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
+                var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0)
+        else:
+            # assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
+            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
+            mu = np.dot(Kx, self.Cpsi1V)
+            if full_cov:
+                raise NotImplementedError, "TODO"
+            else:
+                Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
+                psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
+                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
+        return mu, var[:,None]
 
 
     def _getstate(self):
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 82de1419..3e5022aa 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs
+from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
 
 class Posterior(object):
     """
@@ -62,6 +62,7 @@ class Posterior(object):
 
         #compute this lazily
         self._precision = None
+        self._woodbury_inv = None
 
     @property
     def mean(self):
@@ -91,6 +92,14 @@ class Posterior(object):
             _, _, self._woodbury_chol, _ = pdinv(Wi)
         return self._woodbury_chol
 
+    @property
+    def woodbury_inv(self):
+        if self._woodbury_inv is None:
+            self._woodbury_inv, _ = dpotri(self.woodbury_chol)
+            symmetrify(self._woodbury_inv)
+        return self._woodbury_inv
+
+
     @property
     def woodbury_vector(self):
         if self._woodbury_vector is None:
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 7ceeff11..b5ba4c2d 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
-from ...util.linalg import jitchol, backsub_both_sides, tdot
+from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
 import numpy as np
 log_2_pi = np.log(2*np.pi)
 
@@ -30,7 +30,7 @@ class VarDTC(object):
         if (N>D):
             return Y
         else:
-            #if Y in self.cache, return self.Cache[Y], else stor Y in cache and return L.
+            #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
             raise NotImplementedError, 'TODO' #TODO
 
     def get_VVTfactor(self, Y, prec):
@@ -66,9 +66,9 @@ class VarDTC(object):
         # The rather complex computations of A
         if uncertain_inputs:
             if het_noise:
-                psi2_beta = (psi2 * (likelihood.precision.flatten().reshape(num_data, 1, 1))).sum(0)
+                psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0)
             else:
-                psi2_beta = psi2.sum(0) * likelihood.precision
+                psi2_beta = psi2.sum(0) * beta
             evals, evecs = linalg.eigh(psi2_beta)
             clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
             if not np.array_equal(evals, clipped_evals):
@@ -89,6 +89,7 @@ class VarDTC(object):
 
         # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
         VVT_factor = self.get_VVTfactor(Y, beta)
+        trYYT = np.sum(np.square(Y))
         psi1Vf = np.dot(psi1.T, VVT_factor)
 
         # back substutue C into psi1Vf
@@ -97,17 +98,6 @@ class VarDTC(object):
         tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
         Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1)
 
-        #compute log marginal likelihood
-        if het_noise:
-            A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
-            B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
-        else:
-            A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * likelihood.trYYT
-            B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
-        C = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
-        D = 0.5 * data_fit
-        log_marginal = A + B + C + D
-
 
         # Compute dL_dKmm
         tmp = tdot(_LBi_Lmi_psi1Vf)
@@ -120,17 +110,17 @@ class VarDTC(object):
 
         # Compute dL_dpsi 
         dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
-        dL_dpsi1 = np.dot(likelihood.VVT_factor, Cpsi1Vf.T)
+        dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
         dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
 
         if het_noise:
             if uncertain_inputs:
                 dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
             else:
-                dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * likelihood.precision.reshape(num_data, 1)).T).T
+                dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
                 dL_dpsi2 = None
         else:
-            dL_dpsi2 = likelihood.precision * dL_dpsi2_beta
+            dL_dpsi2 = beta * dL_dpsi2_beta
             if uncertain_inputs:
                 # repeat for each of the N psi_2 matrices
                 dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
@@ -152,40 +142,55 @@ class VarDTC(object):
                 Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0)
                 _LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
 
-                partial_for_likelihood = -0.5 * likelihood.precision + 0.5 * likelihood.V**2
-                partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * likelihood.precision**2
+                partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2
+                partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
 
-                partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*likelihood.precision**2
+                partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
 
-                partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * likelihood.precision**2
-                partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * likelihood.precision**2
+                partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2
+                partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
 
         else:
             # likelihood is not heteroscedatic
-            partial_for_likelihood = -0.5 * num_data * output_dim * likelihood.precision + 0.5 * likelihood.trYYT * likelihood.precision ** 2
-            partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * likelihood.precision ** 2 - np.trace(_A) * likelihood.precision)
-            partial_for_likelihood += likelihood.precision * (0.5 * np.sum(_A * DBi_plus_BiPBi) - data_fit)
+            partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
+            partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
+            partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
+
+        #compute log marginal likelihood
+        if het_noise:
+            lik_1 = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
+            lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+        else:
+            lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT
+            lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+        lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
+        lik_4 = 0.5 * data_fit
+        log_marginal = lik_1 + lik_2 + lik_3 + lik_4
 
         #put the gradients in the right places
-        likelihood.update_gradients(np.diag(partial_for_likelihood))
+        likelihood.update_gradients(partial_for_likelihood)
 
         if uncertain_inputs:
-            kern.update_gradients_variational(??)
             grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
+            kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
         else:
-            kern.update_gradients_sparse(??)
-            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1}
+            kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
 
         #get sufficient things for posterior prediction
         if VVT_factor.shape[1] == Y.shape[1]:
             woodbury_vector = Cpsi1Vf # == Cpsi1V
-            woodbury_chol = ??
         else:
-            raise NotImplementedError #TODO
+            psi1V = np.dot(Y.T*beta, psi1).T
+            tmp, _ = dtrtrs(Lm, np.asfortranarray(psi1V), lower=1, trans=0)
+            tmp, _ = dpotrs(LB, tmp, lower=1)
+            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+        #TODO: totally wrong, fix.
+        woodbury_chol = np.eye(num_inducing)
 
         #construct a posterior object
         post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
 
-        return Posterior, log_marginal, grad_dict
+        return post, log_marginal, grad_dict
 
 
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index de87ff14..68b26e3c 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -284,9 +284,10 @@ class kern(Parameterized):
         [p.update_gradients_full(dL_dK, X) for p in self._parameters_]
 
     def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
-        raise NotImplementedError
+        [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_]
+
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
-        raise NotImplementedError
+        [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]
 
     def dK_dtheta(self, dL_dK, X, X2=None):
         """
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 885fed96..89f6894c 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -117,7 +117,7 @@ class RBF(Kernpart):
             self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
 
         else:
-            self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
+            self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
 
         #from Kmm
         self._K_computations(Z, None)
diff --git a/GPy/kern/parts/white.py b/GPy/kern/parts/white.py
index a59c9f98..c9677f28 100644
--- a/GPy/kern/parts/white.py
+++ b/GPy/kern/parts/white.py
@@ -32,7 +32,7 @@ class White(Kernpart):
         self.variance.gradient = np.trace(dL_dK)
 
     def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
-        raise NotImplementedError
+        self.variance.gradient = np.trace(dL_dKmm) + np.sum(dL_dKdiag)
 
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
         raise NotImplementedError