diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 704297ef..57ae9be7 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,6 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def debug_student_t_noise_approx():
+    plot = False
     real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
@@ -57,8 +58,6 @@ def debug_student_t_noise_approx():
     #Y += noise
 
     plt.close('all')
-    plt.figure(1)
-    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
@@ -75,12 +74,14 @@ def debug_student_t_noise_approx():
     m.ensure_default_constraints()
     m.optimize()
     # plot
-    plt.subplot(131)
-    m.plot()
-    plt.plot(X_full, Y_full)
+    if plot:
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        plt.subplot(131)
+        m.plot()
+        plt.plot(X_full, Y_full)
     print m
 
-    plt.suptitle('Student-t likelihood')
     edited_real_sd = initial_var_guess #real_sd
 
     print "Clean student t, rasm"
@@ -91,10 +92,12 @@ def debug_student_t_noise_approx():
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(132)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    if plot:
+        plt.suptitle('Student-t likelihood')
+        plt.subplot(132)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
 
     print "Clean student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -104,12 +107,13 @@ def debug_student_t_noise_approx():
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(133)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    if plot:
+        plt.subplot(133)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
 
-    plt.show()
+    #plt.show()
 
 def student_t_approx():
     """
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 934b2a90..566e4e25 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -5,8 +5,8 @@ from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
-from scipy.linalg.flapack import dtrtrs
-import pylab as plt
+from scipy.linalg.lapack import dtrtrs
+#import pylab as plt
 
 
 class Laplace(likelihood):
@@ -79,9 +79,9 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
         dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki = inv(self.K)
+        #Ki, _, _, _ = pdinv(self.K)
         #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
@@ -95,9 +95,8 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
 
-        dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
+        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
 
         print "Computing K gradients"
         print "dytil_dfhat: ", np.mean(dytil_dfhat)
@@ -107,7 +106,8 @@ class Laplace(likelihood):
         #plt.imshow(A)
         #plt.show()
 
-        I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        I_KW_i = self.Bi # could use self.B_chol??
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
@@ -117,25 +117,44 @@ class Laplace(likelihood):
             dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
 
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
-        #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
         #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
-        dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
-        dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
-        for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
-            dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
-
-        print "dL_dytil: ", np.mean(dL_dytil)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
+        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
+        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        dSigma_dthetaK_explicit = 0
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
+        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
+        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
+        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
+        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
+            #dSigma_dthetaK_explicit = 0
+            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
+            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
 
         #FIXME: Won't handle multi dimensional data
         dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
-        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1))
+        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
         dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
-        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
+        #print "\n"
+        #print "dL_dytil: ", np.mean(dL_dytil)
+        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
+        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        #print "\n"
+        #print "dL_dSigma: ", np.mean(dL_dSigma)
+        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
+        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        #print "\n"
+        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
+        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
         return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
@@ -159,27 +178,51 @@ class Laplace(likelihood):
         dW_dthetaX = d_dthetaX[d2phi_d2fhat]
         d2phi_d2fhat = Hessian function of likelihood
 
-        partial = dL_dK
+        partial = dL_d_K_Sigma
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+
+        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
+        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
+        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+
+        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
+        dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
+
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #-1 as we are looking at W which is -1*d2log p(y|f)
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi))
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
 
         #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         #Derivative for each f dimension, for each of K's hyper parameters
-        dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
-        for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
-            dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
-                                             dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
-                                             self.Sigma_tilde
-                                             )
+        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
+        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
+            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
+                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
+                                             #self.Sigma_tilde
+                                             #)
 
         #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
         #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
-        dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
 
         #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-        dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+
+        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index cd6467d7..2176aac0 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -248,17 +248,16 @@ class student_t(likelihood_function):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
-        #NB f-y not y-f
-        e = f - y
-        d3link_d3f = (  (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+        e = y - f
+        d3link_d3f = (  (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
-        return d3link_d3f
+        return np.squeeze(d3link_d3f)
 
     def link_hess_grad_std(self, y, f, extra_data=None):
         """
@@ -270,10 +269,10 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        hess_grad_sigma = (  (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+        hess_grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
                            / ((e**2 + (self.sigma**2)*self.v)**3)
                           )
-        return hess_grad_sigma
+        return np.squeeze(hess_grad_sigma)
 
     def link_grad_std(self, y, f, extra_data=None):
         """
@@ -288,11 +287,11 @@ class student_t(likelihood_function):
         grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
-        return grad_sigma
+        return np.squeeze(grad_sigma)
 
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_grad_std(y, f, extra_data=extra_data)[:, None],
-                self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters
+        return [self.link_grad_std(y, f, extra_data=extra_data),
+                self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters
 
     def predictive_values(self, mu, var):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index a346b47b..1682ee6c 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -125,19 +125,23 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
+            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
 
-            print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
+            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
 
-            dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
-        else:
-            print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-        print "dL_dthetaL: ", dL_dthetaL
+            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        else:
+            #print "dL_dthetaK: ", dL_dthetaK
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))