diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/non_gaussian.py
similarity index 77%
rename from GPy/examples/laplace_approximations.py
rename to GPy/examples/non_gaussian.py
index f74e4d37..622b3edd 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/non_gaussian.py
@@ -2,22 +2,21 @@ import GPy
 import numpy as np
 import matplotlib.pyplot as plt
 from GPy.util import datasets
-#np.random.seed(1)
 
-def student_t_approx():
+def student_t_approx(optimize=True, plot=True):
     """
-    Example of regressing with a student t likelihood
+    Example of regression with a Student-t likelihood using the Laplace approximation
     """
     real_std = 0.1
     #Start a function, any function
     X = np.linspace(0.0, np.pi*2, 100)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Y = Y/Y.max()
     Yc = Y.copy()
 
     X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
     Y_full = np.sin(X_full)
-
-    Y = Y/Y.max()
+    Y_full = Y_full/Y_full.max()
 
     #Slightly noisy data
     Yc[75:80] += 1
@@ -34,94 +33,93 @@ def student_t_approx():
     deg_free = 5
     print "Real noise: ", real_std
     initial_var_guess = 0.5
+    edited_real_sd = initial_var_guess
 
-    #t_rv = t(deg_free, loc=0, scale=real_var)
-    #noise = t_rvrvs(size=Y.shape)
-    #Y += noise
-
-    plt.figure(1)
-    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
-    kernel5 = kernel1.copy()
-    kernel6 = kernel1.copy()
 
-    print "Clean Gaussian"
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
+    #Gaussian GP model on clean data
+    m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
     # optimize
-    m.ensure_default_constraints()
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    m.optimize()
-    # plot
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Gaussian clean')
-    print m
+    m1.ensure_default_constraints()
+    m1.constrain_fixed('white', 1e-5)
+    m1.randomize()
 
-    #Corrupt
-    print "Corrupt Gaussian"
-    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
-    m.ensure_default_constraints()
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    m.optimize()
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Gaussian corrupt')
-    print m
+    #Gaussian GP model on corrupt data
+    m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
+    m2.ensure_default_constraints()
+    m2.constrain_fixed('white', 1e-5)
+    m2.randomize()
 
-    plt.figure(2)
-    plt.suptitle('Student-t likelihood')
-    edited_real_sd = initial_var_guess
-
-    print "Clean student t, rasm"
+    #Student t GP model on clean data
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_positive('t_noise')
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    #m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Student-t rasm clean')
+    m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
+    m3.ensure_default_constraints()
+    m3.constrain_bounded('t_noise', 1e-6, 10.)
+    m3.constrain_fixed('white', 1e-5)
+    m3.randomize()
 
-    print "Corrupt student t, rasm"
+    #Student t GP model on corrupt data
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_bounded('t_noise', 1e-6, 10.)
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    for a in range(1):
-        m.randomize()
-        m_start = m.copy()
-        print m
-        m.optimize('scg', messages=1)
-    print(m)
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Student-t rasm corrupt')
+    m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    m4.ensure_default_constraints()
+    m4.constrain_bounded('t_noise', 1e-6, 10.)
+    m4.constrain_fixed('white', 1e-5)
+    m4.randomize()
 
-    return m
+    if optimize:
+        optimizer='scg'
+        print "Clean Gaussian"
+        m1.optimize(optimizer, messages=1)
+        print "Corrupt Gaussian"
+        m2.optimize(optimizer, messages=1)
+        print "Clean student t"
+        m3.optimize(optimizer, messages=1)
+        print "Corrupt student t"
+        m4.optimize(optimizer, messages=1)
+
+    if False:
+        print m1
+        print m3
+        plt.figure(3)
+        plt.scatter(X, m1.likelihood.Y, c='g')
+        plt.scatter(X, m3.likelihood.Y, c='r')
+
+    if plot:
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        ax = plt.subplot(211)
+        m1.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian clean')
+
+        ax = plt.subplot(212)
+        m2.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian corrupt')
+
+        plt.figure(2)
+        plt.suptitle('Student-t likelihood')
+        ax = plt.subplot(211)
+        m3.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm clean')
+
+        ax = plt.subplot(212)
+        m4.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm corrupt')
+
+    return m1, m2, m3, m4
 
 def boston_example():
     import sklearn
@@ -294,3 +292,4 @@ def precipitation_example():
     for n, (train, test) in enumerate(kf):
         X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
         print "Fold {}".format(n)
+
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 57160d64..e5dcdd19 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -15,6 +15,7 @@ import scipy as sp
 from likelihood import likelihood
 from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
+import warnings
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -64,6 +65,7 @@ class Laplace(likelihood):
         self.YYT = None
 
         self.old_Ki_f = None
+        self.bad_fhat = False
 
     def predictive_values(self,mu,var,full_cov,**noise_args):
         if full_cov:
@@ -198,18 +200,16 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W12BiW12
-        self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
 
-        Z_tilde = (+ self.lik
+        Z_tilde = (+ lik
                    - 0.5*self.ln_B_det
-                   + 0.5*self.ln_det_Wi_K
+                   + 0.5*ln_det_Wi_K
                    - 0.5*self.f_Ki_f
-                   + 0.5*self.y_Wi_Ki_i_y
+                   + 0.5*y_Wi_K_i_y
                   )
-        #print "Term, {}, {}, {}, {}, {}".format(self.lik, - 0.5*self.ln_B_det, + 0.5*self.ln_det_Wi_K, - 0.5*self.f_Ki_f, + 0.5*self.y_Wi_Ki_i_y)
-
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
@@ -247,7 +247,10 @@ class Laplace(likelihood):
         #At this point get the hessian matrix (or vector as W is diagonal)
         self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
 
-        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
+            self.W[self.W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+
         self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
 
         self.Ki_f = self.Ki_f
@@ -283,11 +286,11 @@ class Laplace(likelihood):
         except:
             import ipdb; ipdb.set_trace()
 
-        W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
+        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
-        return W12BiW12, ln_B_det
+        return W12BiW12a, ln_B_det
 
-    def rasm_mode(self, K, MAX_ITER=30):
+    def rasm_mode(self, K, MAX_ITER=40):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -302,9 +305,10 @@ class Laplace(likelihood):
         """
         #old_Ki_f = np.zeros((self.N, 1))
 
-        #Start f's at zero originally
-        if self.old_Ki_f is None:
-            old_Ki_f = np.zeros((self.N, 1))
+        #Start f's at zero originally, or if we have gone off track, try restarting
+        if self.old_Ki_f is None or self.bad_fhat:
+            old_Ki_f = np.random.rand(self.N, 1)/50.0
+            #old_Ki_f = self.Y
             f = np.dot(K, old_Ki_f)
         else:
             #Start at the old best point
@@ -318,7 +322,7 @@ class Laplace(likelihood):
             return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-5
+        epsilon = 1e-7
         #step_size = 1
         #rs = 0
         i = 0
@@ -381,14 +385,20 @@ class Laplace(likelihood):
 
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
-            difference = np.abs(np.sum(f - f_old))
-            #difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
+            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
             old_Ki_f = Ki_f.copy()
             i += 1
 
         self.old_Ki_f = old_Ki_f.copy()
+
+        #Warn of bad fits
         if difference > epsilon:
-            print "Not perfect f_hat fit difference: {}".format(difference)
+            self.bad_fhat = True
+            warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now perfect again")
 
         self.Ki_f = Ki_f
         return f