diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 57ae9be7..2054881c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -52,7 +52,7 @@ def debug_student_t_noise_approx():
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
-    initial_var_guess = 0.01
+    initial_var_guess = 1
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -84,14 +84,21 @@ def debug_student_t_noise_approx():
 
     edited_real_sd = initial_var_guess #real_sd
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m.ensure_default_constraints()
+    #m.constrain_positive('rbf')
+    m.constrain_fixed('rbf_v', 1.0898)
+    m.constrain_fixed('rbf_l', 1.8651)
+    m.constrain_positive('t_noi')
+    #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    m.optimize()
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
     print(m)
+    return m
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
@@ -99,19 +106,19 @@ def debug_student_t_noise_approx():
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
 
-    print "Clean student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    if plot:
-        plt.subplot(133)
-        m.plot()
-        plt.plot(X_full, Y_full)
-        plt.ylim(-2.5, 2.5)
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #if plot:
+        #plt.subplot(133)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+        #plt.ylim(-2.5, 2.5)
 
     #plt.show()
 
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 208b1102..5b3e8f43 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -63,6 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
+        #print "Setting laplace param with: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -78,10 +79,24 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
-        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki, _, _, _ = pdinv(self.K)
-        #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+
+        Ki, _, _, _ = pdinv(self.K)
+        #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+        #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+
+        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
+        #dytil_dfhat_explicit = self.Wi__Ki_W
+        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
+        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
+
+        a = mdot(dWi_dfhat, Ki, self.f_hat)
+        dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -94,18 +109,18 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-
         #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
 
-        print "Computing K gradients"
-        print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        I = np.eye(self.N)
-        C = np.dot(self.K, self.W)
-        A = I + C
+        #print "Computing K gradients"
+        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
+        #I = np.eye(self.N)
+        #C = np.dot(self.K, self.W)
+        #A = I + C
         #plt.imshow(A)
         #plt.show()
 
         #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #B = I + w12*K*w12
         I_KW_i = self.Bi # could use self.B_chol??
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@@ -113,15 +128,22 @@ class Laplace(likelihood):
         dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
         grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
         for ind_j, thetaj in enumerate(dK_dthetaK):
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
+            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
+            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
 
+        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
+        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "\n"
+
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
         #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
         #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
         #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         dSigma_dthetaK_explicit = 0
@@ -140,19 +162,16 @@ class Laplace(likelihood):
         dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
         dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
         dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-        #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
 
-        #print "\n"
-        #print "dL_dytil: ", np.mean(dL_dytil)
-        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
-        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        #print "\n"
-        #print "dL_dSigma: ", np.mean(dL_dSigma)
-        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
-        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        #print "\n"
-        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        print "\n"
+        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
+        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
+        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        print "\n"
+        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
 
         return np.squeeze(dL_dthetaK_implicit)
 
@@ -182,11 +201,15 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
         #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
-        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
-        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
+        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
+        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
+        KW_I_i = self.Bi # could use self.B_chol??
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
@@ -199,7 +222,7 @@ class Laplace(likelihood):
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
         dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
 
         #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
@@ -219,8 +242,10 @@ class Laplace(likelihood):
         #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
 
         dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
         dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -257,7 +282,7 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, self.W)
+        Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -361,7 +386,6 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 61c79385..6eef9f33 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -176,8 +176,6 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
-        print "Setting student t sigma: ", x
-        print x
         #self.covariance_matrix = np.eye(self.N)*self._variance
         #self.precision = 1./self._variance
 
@@ -288,7 +286,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
         return np.squeeze(grad_sigma)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 79284b59..ff852766 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -66,6 +66,10 @@ class GP(model):
         # self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
 
+        if isinstance(self.likelihood, Laplace):
+            print "Updating approx: ", p
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
 
         self.K = self.kern.K(self.X)
         self.K += self.likelihood.covariance_matrix
@@ -87,14 +91,12 @@ class GP(model):
         return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
 
     def _update_params_callback(self, p):
-        #FIXME:Check the transforming
-        #Set the new parameters of the kernel and likelihood within the optimization
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #parameters will be in transformed space
         self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        #set_params_transformed for likelihood doesn't exist?
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])
         #update the likelihood approximation within the optimisation with the current parameters
         self.update_likelihood_approximation()
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def update_likelihood_approximation(self):
         """
@@ -123,7 +125,9 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
-        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "Log likelihood: ", l
+        return l
 
     def _log_likelihood_gradients(self):
         """
@@ -135,7 +139,7 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
@@ -145,13 +149,11 @@ class GP(model):
             #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
 
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaL: ", dL_dthetaL
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         else:
-            #print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        #print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))