diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6374a5fd..a1c71c71 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -16,6 +16,9 @@ def student_t_approx():
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
     #Y = Y/Y.max()
     Yc[10] += 100
@@ -25,7 +28,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()

     #Add student t random noise to datapoints
-    deg_free = 20 #100000.5
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rv.rvs(size=Y.shape)
@@ -47,6 +50,8 @@ def student_t_approx():
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()

     print "Clean Gaussian"
     #A Gaussian GP should break down here, since the outlying points get a lot of weight
@@ -58,6 +63,7 @@ def student_t_approx():
     # plot
     plt.subplot(211)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m

     #Corrupt
@@ -67,40 +73,64 @@ def student_t_approx():
     m.optimize()
     plt.subplot(212)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m

     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd

-    # Likelihood object
+    print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution)
-
-    print "Clean student t"
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    # optimize
     m.optimize()
     print(m)
-    # plot
-    plt.subplot(211)
+    plt.subplot(221)
     m.plot()
-    plt.ylim(-2.5,2.5)
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)

-    print "Corrupt student t"
+    print "Corrupt student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(212)
+    plt.subplot(224)
     m.plot()
-    plt.ylim(-2.5,2.5)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)

     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

     ###with a student t distribution, since it has heavy tails it should work well
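The commented-out `t_rv` lines above sketch drawing the corruption from a Student-t directly instead of hand-editing `Yc[10]` (note they pass `scale=real_var`, where the scale parameter should be the standard-deviation-like `real_sd`). A minimal stand-alone sketch of that approach with `scipy.stats`; `deg_free` matches the script, the other values are assumed for illustration:

    import numpy as np
    from scipy.stats import t

    deg_free, real_sd = 10, 0.2        # deg_free as in the script; real_sd assumed
    X = np.linspace(1.0, 10.0, 50)[:, None]
    # A Student-t with few degrees of freedom is heavy-tailed, so it throws
    # occasional large outliers -- the case the robust likelihood should absorb
    noise = t.rvs(deg_free, loc=0.0, scale=real_sd, size=X.shape)
    Y = np.sin(X) + noise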
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 1411c22b..8eb69869 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,16 +1,15 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det
-from functools import partial
+from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv,mdot
+from GPy.util.linalg import pdinv, mdot, jitchol
 #import numpy.testing.assert_array_equal


 class Laplace(likelihood):
     """Laplace approximation to a posterior"""

-    def __init__(self, data, likelihood_function):
+    def __init__(self, data, likelihood_function, rasm=True):
         """
         Laplace Approximation
@@ -30,6 +29,7 @@ class Laplace(likelihood):
         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.rasm = rasm

         #Initial values
         self.N, self.D = self.data.shape
@@ -102,20 +102,16 @@ class Laplace(likelihood):
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
         Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
                    + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
                    - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    )
-        self.Z = Z_tilde
-        self.Y = Y_tilde[:, None]
+        #Convert to float as it's (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        #if not self.likelihood_function.log_concave:
-            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-            ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-            ##To cause the posterior to become less certain than the prior and likelihood,
-            ##This is a property only held by non-log-concave likelihoods
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
@@ -125,32 +121,15 @@ class Laplace(likelihood):
     def fit_full(self, K):
         """
         :K: Covariance matrix
         """
         self.K = K.copy()
-        f = np.zeros((self.N, 1))
-        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
-        LOG_K_CONST = -(0.5 * self.log_Kdet)
-        OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
-        #Find \hat(f) using a newton raphson optimizer for example
-        #TODO: Add newton-raphson as subclass of optimizer class
-
-        #FIXME: Can we get rid of this horrible reshaping?
-        def obj(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
-            return float(res)
-
-        def obj_grad(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
-            return np.squeeze(res)
-
-        def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
-            return np.squeeze(res)
-
-        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        if self.rasm:
+            self.f_hat = self.rasm_mode(K)
+        else:
+            self.f_hat = self.ncg_mode(K)

         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                 #If the likelihood is non-log-concave. We want to say that there is a negative variance
@@ -176,8 +155,92 @@ class Laplace(likelihood):
         #Unsure whether it's log_hess or log_hess_i
         self.ln_z_hat = (- 0.5*self.log_hess_hat_det
                          + 0.5*self.log_Kdet
-                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )

         return self._compute_GP_variables()
+
+    def ncg_mode(self, K):
+        """Find the mode using a standard ncg optimizer and an explicit inversion of K (numerically unstable but intuitive)
+
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.K = K.copy()
+        f = np.zeros((self.N, 1))
+        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
+        LOG_K_CONST = -(0.5 * self.log_Kdet)
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + self.NORMAL_CONST + LOG_K_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K):
+        """
+        Rasmussen's numerically stable mode finding.
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        f = np.zeros((self.N, 1))
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            #Careful of the shape of the data!
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+
+        difference = np.inf
+        epsilon = 1e-16
+        step_size = 1
+        while difference > epsilon:
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            if not self.likelihood_function.log_concave:
+                W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
+                                #if the likelihood is non-log-concave. We want to say that there is a negative variance,
+                                #to cause the posterior to become less certain than the prior and likelihood;
+                                #this is a property only held by non-log-concave likelihoods
+            #W is diagonal, so its matrix square root is just the square root of the diagonal elements
+            W_12 = np.sqrt(W)
+            B = np.eye(self.N) + mdot(W_12, K, W_12)
+            L = jitchol(B)
+            #link_grad returns a squeezed (N,) array; reshape it to a column so that b keeps shape (N, 1)
+            b = np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None]
+            #TODO: Check L is lower
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
+            a = b - mdot(W_12, solve_L)
+            f = np.dot(K, a)
+            old_obj = new_obj
+            new_obj = obj(a, f)
+            difference = new_obj - old_obj
+            if difference < 0:
+                #The objective isn't increasing, so reduce the step size and restart the optimization
+                print "Reducing step-size, restarting"
+                step_size *= 0.9
+                f = np.zeros((self.N, 1))
+                new_obj = -np.inf
+                old_obj = np.inf
+
+            difference = abs(difference)
+
+        return f
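On the `#Unsure whether it's log_hess or log_hess_i` question in `fit_full`: the reference form of the Laplace-approximate log evidence (Rasmussen & Williams 2006, eq. 3.32) is

    log Z ~= log p(y|f_hat) - 0.5 * f_hat' K^-1 f_hat - 0.5 * log|B|,   with B = I + W^(1/2) K W^(1/2)

and since |B| = |K| * |K^-1 + W|, the determinant term can be assembled either from B or from the posterior Hessian K^-1 + W; `ln_z_hat` above can be checked numerically against this form.

`rasm_mode` itself follows Algorithm 3.1 of the same book: each Newton step factorizes B, which is symmetric with every eigenvalue at least 1 (once W is clamped nonnegative), instead of ever inverting K. A stripped-down sketch of one such step, without the step-size restart logic above; `grad` and `hess_diag` are hypothetical stand-ins for `link_grad`/`link_hess`, assumed to return (N, 1) columns:

    import numpy as np
    from scipy.linalg import cholesky, cho_solve

    def newton_step(f, K, grad, hess_diag):
        # One Newton step of GPML Algorithm 3.1 on the unnormalised log posterior
        N = f.shape[0]
        W = -hess_diag(f)                     # positive wherever the likelihood is log-concave
        W_12 = np.sqrt(W)
        B = np.eye(N) + W_12 * K * W_12.T     # I + W^1/2 K W^1/2; K^-1 is never formed
        L = cholesky(B, lower=True)
        b = W * f + grad(f)
        a = b - W_12 * cho_solve((L, True), W_12 * np.dot(K, b))
        return np.dot(K, a)                   # updated f; at the mode f = K grad(f)

At convergence f_hat = K * grad(f_hat), which is the stationarity condition of the objective -0.5 * a'f + log p(y|f) that `obj` tracks.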
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 50f9b620..15859a81 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -36,7 +36,10 @@ class student_t(likelihood_function):

         :returns: float(likelihood evaluated for this point)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -57,10 +61,12 @@ class student_t(likelihood_function):

         :returns: gradient of likelihood evaluated at points
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return grad
+        return np.squeeze(grad)
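As a check on the two derivative methods: with e = y - f, the f-dependent part of the Student-t log density is l(f) = -(v + 1)/2 * log(1 + e^2 / (sigma^2 * v)). Differentiating once gives

    dl/df = ((v + 1) * e) / (v * sigma^2 + e^2)

and differentiating again gives

    d2l/df2 = ((v + 1) * (e^2 - v * sigma^2)) / ((v * sigma^2 + e^2)^2)

which matches `link_grad` above and the corrected `link_hess` below; the commented-out `hess` line removed there had the numerator of the gradient, not of the second derivative.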
     def link_hess(self, y, f):
         """
@@ -75,11 +81,12 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is the diagonal of the covariance matrix (second derivative of likelihood evaluated at points)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return hess
+        return np.squeeze(hess)

     def predictive_values(self, mu, var):
         """
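Since `link_grad` and `link_hess` feed straight into the mode finding, a finite-difference test catches the sign and shape errors these hunks fix. A minimal sketch, assuming a `student_t` instance `lik` with the interface above; the tolerances are illustrative:

    import numpy as np

    def check_link_grad(lik, y, f, eps=1e-6, tol=1e-4):
        # Compare link_grad against central differences of link_function
        g = lik.link_grad(y, f)
        for i in range(len(f)):
            f_hi, f_lo = f.copy(), f.copy()
            f_hi[i] += eps
            f_lo[i] -= eps
            fd = (lik.link_function(y, f_hi) - lik.link_function(y, f_lo)) / (2 * eps)
            assert abs(g[i] - fd) < tol, "grad mismatch at %d: %g vs %g" % (i, g[i], fd)

The same pattern applied to `link_grad` checks `link_hess` entry by entry, since the likelihood factorizes over data points and all cross-derivatives vanish.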